Supporting dataframe with integer columns (#203)

* bugfix for describe and convert_dtypes
* added back metadata series test
* black
* default to pandas display when df.dtypes printed
* various fixes to support int columns
lux-org · Jan 7, 2021 · 3393b9f · 3393b9f
1 parent 459b4bf
commit 3393b9f
Show file tree

Hide file tree

Showing 18 changed files with 172 additions and 134 deletions.
diff --git a/lux/action/enhance.py b/lux/action/enhance.py
@@ -37,8 +37,8 @@ def enhance(ldf):
     # Collect variables that already exist in the intent
     attr_specs = list(filter(lambda x: x.value == "" and x.attribute != "Record", ldf._intent))
     fltr_str = [fltr.attribute + fltr.filter_op + str(fltr.value) for fltr in filters]
-    attr_str = [clause.attribute for clause in attr_specs]
-    intended_attrs = '<p class="highlight-intent">' + ", ".join(attr_str + fltr_str) + "</p>"
+    attr_str = [str(clause.attribute) for clause in attr_specs]
+    intended_attrs = f'<p class="highlight-intent">{", ".join(attr_str + fltr_str)}</p>'
     if len(attr_specs) == 1:
         recommendation = {
             "action": "Enhance",

diff --git a/lux/action/filter.py b/lux/action/filter.py
@@ -91,7 +91,7 @@ def get_complementary_ops(fltr_op):
     else:
         intended_attrs = ", ".join(
             [
-                clause.attribute
+                str(clause.attribute)
                 for clause in ldf._intent
                 if clause.value == "" and clause.attribute != "Record"
             ]

diff --git a/lux/action/generalize.py b/lux/action/generalize.py
@@ -42,8 +42,8 @@ def generalize(ldf):
     filters = utils.get_filter_specs(ldf._intent)
 
     fltr_str = [fltr.attribute + fltr.filter_op + str(fltr.value) for fltr in filters]
-    attr_str = [clause.attribute for clause in attributes]
-    intended_attrs = '<p class="highlight-intent">' + ", ".join(attr_str + fltr_str) + "</p>"
+    attr_str = [str(clause.attribute) for clause in attributes]
+    intended_attrs = f'<p class="highlight-intent">{", ".join(attr_str + fltr_str)}</p>'
 
     recommendation = {
         "action": "Generalize",
@@ -66,7 +66,7 @@ def generalize(ldf):
                         temp_vis.remove_column_from_spec(column, remove_first=True)
                         excluded_columns.append(column)
                         output.append(temp_vis)
-            elif type(columns) == str:
+            else:
                 if columns not in excluded_columns:
                     temp_vis = Vis(ldf.copy_intent(), score=1)
                     temp_vis.remove_column_from_spec(columns, remove_first=True)

diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py
@@ -90,11 +90,11 @@ def execute(vislist: VisList, ldf: LuxDataFrame):
             # Select relevant data based on attribute information
             attributes = set([])
             for clause in vis._inferred_intent:
-                if clause.attribute:
-                    if clause.attribute != "Record":
-                        attributes.add(clause.attribute)
+                if clause.attribute != "Record":
+                    attributes.add(clause.attribute)
             # TODO: Add some type of cap size on Nrows ?
             vis._vis_data = vis.data[list(attributes)]
+
             if vis.mark == "bar" or vis.mark == "line":
                 PandasExecutor.execute_aggregate(vis, isFiltered=filter_executed)
             elif vis.mark == "histogram":

diff --git a/lux/processor/Parser.py b/lux/processor/Parser.py
@@ -46,7 +46,6 @@ def parse(intent: List[Union[Clause, str]]) -> List[Clause]:
             )
         import re
 
-        # intent = ldf.get_context()
         new_context = []
         # checks for and converts users' string inputs into lux specifications
         for clause in intent:
@@ -59,37 +58,40 @@ def parse(intent: List[Union[Clause, str]]) -> List[Clause]:
                         valid_values.append(v)
                 temp_spec = Clause(attribute=valid_values)
                 new_context.append(temp_spec)
-            elif isinstance(clause, str):
-                # case where user specifies a filter
-                if "=" in clause:
-                    eqInd = clause.index("=")
-                    var = clause[0:eqInd]
-                    if "|" in clause:
-                        values = clause[eqInd + 1 :].split("|")
-                        for v in values:
-                            # if v in ldf.unique_values[var]: #TODO: Move validation check to Validator
-                            valid_values.append(v)
+            elif isinstance(clause, Clause):
+                new_context.append(clause)
+            else:
+                if isinstance(clause, str):
+                    # case where user specifies a filter
+                    if "=" in clause:
+                        eqInd = clause.index("=")
+                        var = clause[0:eqInd]
+                        if "|" in clause:
+                            values = clause[eqInd + 1 :].split("|")
+                            for v in values:
+                                # if v in ldf.unique_values[var]: #TODO: Move validation check to Validator
+                                valid_values.append(v)
+                        else:
+                            valid_values = clause[eqInd + 1 :]
+                        # if var in list(ldf.columns): #TODO: Move validation check to Validator
+                        temp_spec = Clause(attribute=var, filter_op="=", value=valid_values)
+                        new_context.append(temp_spec)
+                    # case where user specifies a variable
                     else:
-                        valid_values = clause[eqInd + 1 :]
-                    # if var in list(ldf.columns): #TODO: Move validation check to Validator
-                    temp_spec = Clause(attribute=var, filter_op="=", value=valid_values)
-                    new_context.append(temp_spec)
-                # case where user specifies a variable
+                        if "|" in clause:
+                            values = clause.split("|")
+                            for v in values:
+                                # if v in list(ldf.columns): #TODO: Move validation check to Validator
+                                valid_values.append(v)
+                        else:
+                            valid_values = clause
+                        temp_spec = Clause(attribute=valid_values)
+                        new_context.append(temp_spec)
                 else:
-                    if "|" in clause:
-                        values = clause.split("|")
-                        for v in values:
-                            # if v in list(ldf.columns): #TODO: Move validation check to Validator
-                            valid_values.append(v)
-                    else:
-                        valid_values = clause
-                    temp_spec = Clause(attribute=valid_values)
+                    temp_spec = Clause(attribute=clause)
                     new_context.append(temp_spec)
-            elif type(clause) is Clause:
-                new_context.append(clause)
-        intent = new_context
-        # ldf._intent = new_context
 
+        intent = new_context
         for clause in intent:
             if clause.description:
                 # TODO: Move validation check to Validator
@@ -112,4 +114,3 @@ def parse(intent: List[Union[Clause, str]]) -> List[Clause]:
                 else:  # then it is probably a value
                     clause.value = clause.description
         return intent
-        # ldf._intent = intent
diff --git a/lux/processor/Validator.py b/lux/processor/Validator.py
@@ -57,9 +57,7 @@ def validate_intent(intent: List[Clause], ldf: LuxDataFrame) -> None:
 
         def validate_clause(clause):
             warn_msg = ""
-            if not (
-                (clause.attribute and clause.attribute == "?") or (clause.value and clause.value == "?")
-            ):
+            if not (clause.attribute == "?" or clause.value == "?" or clause.attribute == ""):
                 if isinstance(clause.attribute, list):
                     for attr in clause.attribute:
                         if attr not in list(ldf.columns):
@@ -69,7 +67,9 @@ def validate_clause(clause):
                 else:
                     if clause.attribute != "Record":
                         # we don't value check datetime since datetime can take filter values that don't exactly match the exact TimeStamp representation
-                        if clause.attribute and not is_datetime_string(clause.attribute):
+                        if isinstance(clause.attribute, str) and not is_datetime_string(
+                            clause.attribute
+                        ):
                             if not clause.attribute in list(ldf.columns):
                                 search_val = clause.attribute
                                 match_attr = False
@@ -80,9 +80,7 @@ def validate_clause(clause):
                                     warn_msg = f"\n- The input '{search_val}' looks like a value that belongs to the '{match_attr}' attribute. \n  Please specify the value fully, as something like {match_attr}={search_val}."
                                 else:
                                     warn_msg = f"\n- The input attribute '{clause.attribute}' does not exist in the DataFrame. \n  Please check your input intent for typos."
-                        if clause.value and clause.attribute and clause.filter_op == "=":
-                            import math
-
+                        if clause.value != "" and clause.attribute != "" and clause.filter_op == "=":
                             # Skip check for NaN filter values
                             if not lux.utils.utils.like_nan(clause.value):
                                 series = ldf[clause.attribute]

diff --git a/lux/utils/utils.py b/lux/utils/utils.py
@@ -57,16 +57,17 @@ def check_import_lux_widget():
 
 
 def get_agg_title(clause):
+    attr = str(clause.attribute)
     if clause.aggregation is None:
-        if len(clause.attribute) > 25:
-            return clause.attribute[:15] + "..." + clause.attribute[-10:]
-        return f"{clause.attribute}"
-    elif clause.attribute == "Record":
+        if len(attr) > 25:
+            return attr[:15] + "..." + attr[-10:]
+        return f"{attr}"
+    elif attr == "Record":
         return f"Number of Records"
     else:
-        if len(clause.attribute) > 15:
-            return f"{clause._aggregation_name.capitalize()} of {clause.attribute[:15]}..."
-        return f"{clause._aggregation_name.capitalize()} of {clause.attribute}"
+        if len(attr) > 15:
+            return f"{clause._aggregation_name.capitalize()} of {attr[:15]}..."
+        return f"{clause._aggregation_name.capitalize()} of {attr}"
 
 
 def check_if_id_like(df, attribute):

diff --git a/lux/vis/Clause.py b/lux/vis/Clause.py
@@ -116,7 +116,7 @@ def to_string(self):
         if isinstance(self.attribute, list):
             clauseStr = "|".join(self.attribute)
         elif self.value == "":
-            clauseStr = self.attribute
+            clauseStr = str(self.attribute)
         else:
             clauseStr = f"{self.attribute}{self.filter_op}{self.value}"
         return clauseStr
@@ -126,23 +126,23 @@ def __repr__(self):
         if self.description != "":
             attributes.append(f"         description: {self.description}")
         if self.channel != "":
-            attributes.append("         channel: " + self.channel)
-        if len(self.attribute) != 0:
-            attributes.append("         attribute: " + str(self.attribute))
+            attributes.append(f"         channel: {self.channel}")
+        if self.attribute != "":
+            attributes.append(f"         attribute: {str(self.attribute)}")
         if self.filter_op != "=":
             attributes.append(f"         filter_op: {str(self.filter_op)}")
         if self.aggregation != "" and self.aggregation is not None:
             attributes.append("         aggregation: " + self._aggregation_name)
         if self.value != "" or len(self.value) != 0:
-            attributes.append("         value: " + str(self.value))
+            attributes.append(f"         value: {str(self.value)}")
         if self.data_model != "":
-            attributes.append("         data_model: " + self.data_model)
+            attributes.append(f"         data_model: {self.data_model}")
         if len(self.data_type) != 0:
-            attributes.append("         data_type: " + str(self.data_type))
-        if self.bin_size != None:
-            attributes.append("         bin_size: " + str(self.bin_size))
+            attributes.append(f"         data_type: {str(self.data_type)}")
+        if self.bin_size != 0:
+            attributes.append(f"         bin_size: {str(self.bin_size)}")
         if len(self.exclude) != 0:
-            attributes.append("         exclude: " + str(self.exclude))
+            attributes.append(f"         exclude: {str(self.exclude)}")
         attributes[0] = "<Clause" + attributes[0][7:]
         attributes[len(attributes) - 1] += " >"
         return ",\n".join(attributes)
diff --git a/lux/vis/Vis.py b/lux/vis/Vis.py
@@ -49,9 +49,9 @@ def __repr__(self):
                 if hasattr(clause, "attribute"):
                     if clause.attribute != "":
                         if clause.aggregation != "" and clause.aggregation is not None:
-                            attribute = clause._aggregation_name.upper() + "(" + clause.attribute + ")"
+                            attribute = f"{clause._aggregation_name.upper()}({clause.attribute})"
                         elif clause.bin_size > 0:
-                            attribute = "BIN(" + clause.attribute + ")"
+                            attribute = f"BIN({clause.attribute})"
                         else:
                             attribute = clause.attribute
                         if clause.channel == "x":
@@ -64,7 +64,7 @@ def __repr__(self):
             channels.extend(additional_channels)
             str_channels = ""
             for channel in channels:
-                str_channels += channel[0] + ": " + channel[1] + ", "
+                str_channels += f"{channel[0]}: {channel[1]}, "
 
             if filter_intents:
                 return f"<Vis  ({str_channels[:-2]} -- [{filter_intents.attribute}{filter_intents.filter_op}{filter_intents.value}]) mark: {self._mark}, score: {self.score} >"
@@ -324,5 +324,8 @@ def check_not_vislist_intent(self):
 
         for i in range(len(self._intent)):
             clause = self._intent[i]
-            if type(clause) != Clause and ("|" in clause or type(clause) == list or "?" in clause):
+            if isinstance(clause, str):
+                if "|" in clause or "?" in clause:
+                    raise TypeError(syntaxMsg)
+            if isinstance(clause, list):
                 raise TypeError(syntaxMsg)
diff --git a/lux/vis/VisList.py b/lux/vis/VisList.py
@@ -133,16 +133,17 @@ def __repr__(self):
         for vis in self._collection:
             filter_intents = None
             for clause in vis._inferred_intent:
+                attr = str(clause.attribute)
                 if clause.value != "":
                     filter_intents = clause
 
                 if clause.aggregation != "" and clause.aggregation is not None:
-                    attribute = clause._aggregation_name.upper() + "(" + clause.attribute + ")"
+                    attribute = clause._aggregation_name.upper() + f"({attr})"
                 elif clause.bin_size > 0:
-                    attribute = "BIN(" + clause.attribute + ")"
+                    attribute = f"BIN({attr})"
                 else:
-                    attribute = clause.attribute
-
+                    attribute = attr
+                attribute = str(attribute)
                 if clause.channel == "x" and len(x_channel) < len(attribute):
                     x_channel = attribute
                 if clause.channel == "y" and len(y_channel) < len(attribute):
@@ -151,9 +152,9 @@ def __repr__(self):
                 largest_mark = len(vis.mark)
             if (
                 filter_intents
-                and len(str(filter_intents.value)) + len(filter_intents.attribute) > largest_filter
+                and len(str(filter_intents.value)) + len(str(filter_intents.attribute)) > largest_filter
             ):
-                largest_filter = len(str(filter_intents.value)) + len(filter_intents.attribute)
+                largest_filter = len(str(filter_intents.value)) + len(str(filter_intents.attribute))
         vis_repr = []
         largest_x_length = len(x_channel)
         largest_y_length = len(y_channel)
@@ -164,16 +165,16 @@ def __repr__(self):
             y_channel = ""
             additional_channels = []
             for clause in vis._inferred_intent:
+                attr = str(clause.attribute)
                 if clause.value != "":
                     filter_intents = clause
 
                 if clause.aggregation != "" and clause.aggregation is not None and vis.mark != "scatter":
-                    attribute = clause._aggregation_name.upper() + "(" + clause.attribute + ")"
+                    attribute = clause._aggregation_name.upper() + f"({attr})"
                 elif clause.bin_size > 0:
-                    attribute = "BIN(" + clause.attribute + ")"
+                    attribute = f"BIN({attr})"
                 else:
-                    attribute = clause.attribute
-
+                    attribute = attr
                 if clause.channel == "x":
                     x_channel = attribute.ljust(largest_x_length)
                 elif clause.channel == "y":
@@ -197,7 +198,7 @@ def __repr__(self):
             if filter_intents:
                 aligned_filter = (
                     " -- ["
-                    + filter_intents.attribute
+                    + str(filter_intents.attribute)
                     + filter_intents.filter_op
                     + str(filter_intents.value)
                     + "]"

diff --git a/lux/vislib/altair/AltairChart.py b/lux/vislib/altair/AltairChart.py
@@ -87,15 +87,17 @@ def encode_color(self):
                 timeUnit = compute_date_granularity(self.vis.data[color_attr_name])
                 self.chart = self.chart.encode(
                     color=alt.Color(
-                        color_attr_name,
+                        str(color_attr_name),
                         type=color_attr_type,
                         timeUnit=timeUnit,
                         title=color_attr_name,
                     )
                 )
                 self.code += f"chart = chart.encode(color=alt.Color('{color_attr_name}',type='{color_attr_type}',timeUnit='{timeUnit}',title='{color_attr_name}'))"
             else:
-                self.chart = self.chart.encode(color=alt.Color(color_attr_name, type=color_attr_type))
+                self.chart = self.chart.encode(
+                    color=alt.Color(str(color_attr_name), type=color_attr_type)
+                )
                 self.code += f"chart = chart.encode(color=alt.Color('{color_attr_name}',type='{color_attr_type}'))\n"
         elif len(color_attr) > 1:
             raise ValueError(
@@ -111,3 +113,11 @@ def add_title(self):
 
     def initialize_chart(self):
         return NotImplemented
+
+    @classmethod
+    def sanitize_dataframe(cls, df):
+        """Return a copy of ``df`` whose column labels are all strings."""
+        # Altair cannot visualize non-string column labels (e.g. integers),
+        # so map every label through ``str`` in a single rename instead of
+        # re-copying the whole frame once per column.
+        return df.rename(columns=str)
diff --git a/lux/vislib/altair/AltairRenderer.py b/lux/vislib/altair/AltairRenderer.py
@@ -66,11 +66,12 @@ def create_vis(self, vis, standalone=True):
                     vis.data[attr].iloc[0], pd.Interval
                 ):
                     vis.data[attr] = vis.data[attr].astype(str)
-                if "." in attr:
-                    attr_clause = vis.get_attr_by_attr_name(attr)[0]
-                    # Suppress special character ".", not displayable in Altair
-                    # attr_clause.attribute = attr_clause.attribute.replace(".", "")
-                    vis._vis_data = vis.data.rename(columns={attr: attr.replace(".", "")})
+                if isinstance(attr, str):
+                    if "." in attr:
+                        attr_clause = vis.get_attr_by_attr_name(attr)[0]
+                        # Suppress special character ".", not displayable in Altair
+                        # attr_clause.attribute = attr_clause.attribute.replace(".", "")
+                        vis._vis_data = vis.data.rename(columns={attr: attr.replace(".", "")})
         if vis.mark == "histogram":
             chart = Histogram(vis)
         elif vis.mark == "bar":