[PATCH] Make df.data_type a property that accesses df._data_type (#231)
* add protected attribute and make property

* access private var to avoid looping

* fix type
jinimukh committed Jan 18, 2021
1 parent d1b9916 commit 91d6025
Showing 2 changed files with 30 additions and 24 deletions.
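The gist of the change is the lazy-property pattern sketched below. This is a minimal, simplified illustration (the class and method bodies are assumptions, not the exact Lux source; see the diffs that follow for the real code):

    # A public read-only property backed by a protected attribute,
    # computed lazily on first access. Names mirror the diff; the
    # metadata result is a placeholder.
    class Frame:
        def __init__(self):
            self._data_type = None  # protected backing attribute

        @property
        def data_type(self):
            # The first read triggers metadata computation on demand.
            if not self._data_type:
                self.maintain_metadata()
            return self._data_type

        def maintain_metadata(self):
            # Internal writers assign to _data_type directly; reading
            # self.data_type here would re-enter the getter.
            self._data_type = {"col": "nominal"}  # placeholder result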
18 changes: 12 additions & 6 deletions lux/core/frame.py
@@ -37,7 +37,7 @@ class LuxDataFrame(pd.DataFrame):
     _metadata = [
         "_intent",
         "_inferred_intent",
-        "data_type",
+        "_data_type",
         "unique_values",
         "cardinality",
         "_rec_info",
@@ -76,7 +76,7 @@ def __init__(self, *args, **kw):
         self._message = Message()
         self._pandas_only = False
         # Metadata
-        self.data_type = None
+        self._data_type = None
         self.unique_values = None
         self.cardinality = None
         self._min_max = None
@@ -101,6 +101,12 @@ def f(*args, **kwargs):
     def history(self):
         return self._history

+    @property
+    def data_type(self):
+        if not self._data_type:
+            self.maintain_metadata()
+        return self._data_type
+
     def maintain_metadata(self):
         # Check that metadata has not yet been computed
         if not hasattr(self, "_metadata_fresh") or not self._metadata_fresh:
@@ -127,7 +133,7 @@ def expire_metadata(self):
         Expire all saved metadata to trigger a recomputation the next time the data is required.
         """
         self._metadata_fresh = False
-        self.data_type = None
+        self._data_type = None
         self.unique_values = None
         self.cardinality = None
         self._min_max = None
@@ -293,7 +299,7 @@ def compute_SQL_dataset_metadata(self):
         self.get_SQL_attributes()
         for attr in list(self.columns):
             self[attr] = None
-        self.data_type = {}
+        self._data_type = {}
         #####NOTE: since we aren't expecting users to do much data processing with the SQL database, should we just keep this
         ##### in the initialization and do it just once
         self.compute_SQL_data_type()
@@ -307,7 +313,7 @@ def compute_SQL_stats(self):
         self.get_SQL_unique_values()
         # self.get_SQL_cardinality()
         for attribute in self.columns:
-            if self.data_type[attribute] == "quantitative":
+            if self._data_type[attribute] == "quantitative":
                 self._min_max[attribute] = (
                     self[attribute].min(),
                     self[attribute].max(),
@@ -381,7 +387,7 @@ def compute_SQL_data_type(self):
                 data_type[attr] = "quantitative"
             elif "time" in sql_dtypes[attr] or "date" in sql_dtypes[attr]:
                 data_type[attr] = "temporal"
-        self.data_type = data_type
+        self._data_type = data_type

     def _append_rec(self, rec_infolist, recommendations: Dict):
         if recommendations["collection"] is not None and len(recommendations["collection"]) > 0:
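With the property in place, metadata computation becomes demand-driven rather than eagerly maintained. A hedged usage sketch (assumes lux is installed and that importing it upgrades pandas DataFrames to LuxDataFrame, as in standard Lux usage; the CSV path is a placeholder):

    import pandas as pd
    import lux  # noqa: F401 -- importing lux activates LuxDataFrame

    df = pd.read_csv("data.csv")  # placeholder path
    # The first read of df.data_type calls maintain_metadata() lazily,
    # then returns the computed {column: type} dict.
    print(df.data_type)  # e.g. {"col_a": "quantitative", "col_b": "nominal"}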
36 changes: 18 additions & 18 deletions lux/executor/PandasExecutor.py
@@ -392,7 +392,7 @@ def execute_2D_binning(vis: Vis):
     ############ Metadata: data type, model #############
     #######################################################
     def compute_dataset_metadata(self, ldf: LuxDataFrame):
-        ldf.data_type = {}
+        ldf._data_type = {}
         self.compute_data_type(ldf)

     def compute_data_type(self, ldf: LuxDataFrame):
@@ -401,50 +401,50 @@ def compute_data_type(self, ldf: LuxDataFrame):
         for attr in list(ldf.columns):
             temporal_var_list = ["month", "year", "day", "date", "time", "weekday"]
             if is_datetime(ldf[attr]):
-                ldf.data_type[attr] = "temporal"
+                ldf._data_type[attr] = "temporal"
             elif self._is_datetime_string(ldf[attr]):
-                ldf.data_type[attr] = "temporal"
+                ldf._data_type[attr] = "temporal"
             elif isinstance(attr, pd._libs.tslibs.timestamps.Timestamp):
-                ldf.data_type[attr] = "temporal"
+                ldf._data_type[attr] = "temporal"
             elif str(attr).lower() in temporal_var_list:
-                ldf.data_type[attr] = "temporal"
+                ldf._data_type[attr] = "temporal"
             elif self._is_datetime_number(ldf[attr]):
-                ldf.data_type[attr] = "temporal"
+                ldf._data_type[attr] = "temporal"
             elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
                 # int columns gets coerced into floats if contain NaN
                 convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes())
                 if convertible2int and ldf.cardinality[attr] != len(ldf) and ldf.cardinality[attr] < 20:
-                    ldf.data_type[attr] = "nominal"
+                    ldf._data_type[attr] = "nominal"
                 else:
-                    ldf.data_type[attr] = "quantitative"
+                    ldf._data_type[attr] = "quantitative"
             elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]):
                 # See if integer value is quantitative or nominal by checking if the ratio of cardinality/data size is less than 0.4 and if there are less than 10 unique values
                 if ldf.pre_aggregated:
                     if ldf.cardinality[attr] == len(ldf):
-                        ldf.data_type[attr] = "nominal"
+                        ldf._data_type[attr] = "nominal"
                 if ldf.cardinality[attr] / len(ldf) < 0.4 and ldf.cardinality[attr] < 20:
-                    ldf.data_type[attr] = "nominal"
+                    ldf._data_type[attr] = "nominal"
                 else:
-                    ldf.data_type[attr] = "quantitative"
+                    ldf._data_type[attr] = "quantitative"
                 if check_if_id_like(ldf, attr):
-                    ldf.data_type[attr] = "id"
+                    ldf._data_type[attr] = "id"
             # Eliminate this clause because a single NaN value can cause the dtype to be object
             elif pd.api.types.is_string_dtype(ldf.dtypes[attr]):
                 if check_if_id_like(ldf, attr):
-                    ldf.data_type[attr] = "id"
+                    ldf._data_type[attr] = "id"
                 else:
-                    ldf.data_type[attr] = "nominal"
+                    ldf._data_type[attr] = "nominal"
             # check if attribute is any type of datetime dtype
             elif is_datetime_series(ldf.dtypes[attr]):
-                ldf.data_type[attr] = "temporal"
+                ldf._data_type[attr] = "temporal"
             else:
-                ldf.data_type[attr] = "nominal"
+                ldf._data_type[attr] = "nominal"
         if not pd.api.types.is_integer_dtype(ldf.index) and ldf.index.name:
-            ldf.data_type[ldf.index.name] = "nominal"
+            ldf._data_type[ldf.index.name] = "nominal"

         non_datetime_attrs = []
         for attr in ldf.columns:
-            if ldf.data_type[attr] == "temporal" and not is_datetime(ldf[attr]):
+            if ldf._data_type[attr] == "temporal" and not is_datetime(ldf[attr]):
                 non_datetime_attrs.append(attr)
         warn_msg = ""
         if len(non_datetime_attrs) == 1:
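Note that the executor now writes only to ldf._data_type, never to the public ldf.data_type, which is what the "access private var to avoid looping" bullet in the commit message refers to. A simplified sketch of the recursion being avoided (assumed behavior, not the Lux source):

    class BadFrame:
        def __init__(self):
            self._data_type = None

        @property
        def data_type(self):
            if not self._data_type:
                self.compute_data_type()
            return self._data_type

        def compute_data_type(self):
            # BUG: indexing self.data_type invokes the getter while
            # _data_type is still empty, which calls compute_data_type()
            # again, recursing until RecursionError.
            self.data_type["col"] = "nominal"
            # FIX (what this commit does): write the backing dict directly,
            # e.g. self._data_type = {"col": "nominal"}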
