Support polars as DataFrame engine (#67)

and adjust all tests and doc scripts
phurwicz · Apr 28, 2023 · ff7a16e · ff7a16e
1 parent 0043e6d
commit ff7a16e
Show file tree

Hide file tree

Showing 45 changed files with 1,778 additions and 340 deletions.
diff --git a/.github/workflows/cross-os-conda-build.yml b/.github/workflows/cross-os-conda-build.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.8', '3.9', '3.10']
+        python-version: ['3.8', '3.10']
         os: [ubuntu-latest, macos-latest, windows-latest]
 
     steps:

diff --git a/.github/workflows/cross-os-install-source.yml b/.github/workflows/cross-os-install-source.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.8', '3.9', '3.10']
+        python-version: ['3.8', '3.10']
         os: [ubuntu-latest, macos-latest, windows-latest]
 
     steps:

diff --git a/.github/workflows/cross-os-source-test.yml b/.github/workflows/cross-os-source-test.yml
@@ -25,8 +25,15 @@ jobs:
       with:
         python-version: ${{ matrix.python-version }}
 
-    - name: Test with Tox
+    - name: Get dependencies
       run: |
         pip install --upgrade pip
         pip install --upgrade tox tox-gh-actions
+
+    - name: Test - default config
+      run: |
         tox -e test_api
+
+    - name: Test - alt config 1
+      run: |
+        tox -e test_api -- --hover-ini tests/module_config/hover_alt_config_1.ini
diff --git a/.github/workflows/doc-auto-notebook.yml b/.github/workflows/doc-auto-notebook.yml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        python-version: ['3.8']
+        python-version: ['3.9']
         os: [ubuntu-latest]
 
     steps:

diff --git a/.github/workflows/doc-script-test.yml b/.github/workflows/doc-script-test.yml
@@ -31,7 +31,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.8', '3.9']
+        python-version: ['3.9']
         os: [ubuntu-latest]
 
     steps:

diff --git a/.github/workflows/quick-source-test.yml b/.github/workflows/quick-source-test.yml
@@ -47,12 +47,19 @@ jobs:
         restore-keys: |
           ${{ runner.os }}-${{ runner.python-version }}-tox-env-
 
-    - name: Test and make coverage report
+    - name: Get dependencies
       run: |
         pip install --upgrade pip
         pip install --upgrade tox tox-gh-actions
+
+    - name: Test - default config
+      run: |
         tox -e test_api
 
+    - name: Test - alt config 1
+      run: |
+        tox -e test_api -- --hover-ini tests/module_config/hover_alt_config_1.ini
+
     - name: Codacy Coverage Reporter
       uses: codacy/codacy-coverage-reporter-action@master
       if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'push' }}

diff --git a/docs/snippets/py/g0-4a-reduction-print.txt b/docs/snippets/py/g0-4a-reduction-print.txt
@@ -1 +1 @@
-dataset.dfs["raw"].head(5)
+dataset.dfs["raw"]().head(5)
diff --git a/docs/snippets/py/t0-0a-dataset-text-print.txt b/docs/snippets/py/t0-0a-dataset-text-print.txt
@@ -1,2 +1,2 @@
 # each subset can be accessed as its own DataFrame
-dataset.dfs["raw"].head(5)
+dataset.dfs["raw"]().head(5)
diff --git a/docs/snippets/py/t0-1a-vectorizer-print.txt b/docs/snippets/py/t0-1a-vectorizer-print.txt
@@ -1,4 +1,4 @@
-text = dataset.dfs["raw"].loc[0, "text"]
+text = dataset.dfs["raw"]().loc[0, "text"]
 vec = vectorizer(text)
 print(f"Text: {text}")
 print(f"Vector shape: {vec.shape}")
diff --git a/docs/snippets/py/t0-2a-reduction-print.txt b/docs/snippets/py/t0-2a-reduction-print.txt
@@ -1,2 +1,2 @@
 # what we did adds 'embed_2d_0' and 'embed_2d_1' columns to the DataFrames in dataset.dfs
-dataset.dfs["raw"].head(5)
+dataset.dfs["raw"]().head(5)
diff --git a/docs/snippets/py/t3-2-dataset-selection-table.txt b/docs/snippets/py/t3-2-dataset-selection-table.txt
@@ -1,3 +1,3 @@
-dataset._callback_update_selection(dataset.dfs["raw"].loc[:10])
+dataset._callback_update_selection(dataset.dfs["raw"][:10])
 
 show(dataset.sel_table, notebook_url=notebook_url)
diff --git a/hover/__init__.py b/hover/__init__.py
@@ -1,124 +1,145 @@
 """
 Module root where constants get configured.
 """
-import re
+from .config_constants import (
+    ConfigSection,
+    ConfigKey,
+    Validator,
+    Preprocessor,
+)
 from flexmod import AutolockedConfigValue, Config, ConfigIndex
 from bokeh.palettes import Turbo256
 
+
 config = ConfigIndex(
     [
         Config(
-            "io",
+            ConfigSection.IO,
             [
                 AutolockedConfigValue(
-                    "data_save_dir",
+                    ConfigKey.DATA_SAVE_DIR,
                     "The directory path for saving labeled data.",
                     ".",
-                    validation=lambda x: isinstance(x, str),
+                    validation=Validator.is_str,
+                ),
+            ],
+        ),
+        Config(
+            ConfigSection.BACKEND,
+            [
+                AutolockedConfigValue(
+                    ConfigKey.DATAFRAME_LIBRARY,
+                    "The library to use for internal dataframes. Must be 'pandas' or 'polars'.",
+                    "pandas",
+                    validation=Validator.is_supported_dataframe_library,
+                    preprocessor=Preprocessor.lower,
                 ),
             ],
         ),
         Config(
-            "visual",
+            ConfigSection.VISUAL,
             [
                 AutolockedConfigValue(
-                    "abstain_hexcolor",
+                    ConfigKey.ABSTAIN_HEXCOLOR,
                     "Hex code of RGB color.",
                     "#dcdcdc",
-                    validation=lambda x: bool(re.match(r"^\#[0-9a-fA-F]{6}$", x)),
+                    validation=Validator.is_hex_color,
+                    preprocessor=Preprocessor.lower,
                 ),
                 AutolockedConfigValue(
-                    "bokeh_palette",
+                    ConfigKey.BOKEH_PALETTE,
                     "The bokeh color palette to use for plotting. This should be a list of hex color codes.",
                     Turbo256,
-                    validation=lambda x: hasattr(x, "__iter__"),
+                    validation=Validator.is_iterable_of_hex_color,
                 ),
                 AutolockedConfigValue(
-                    "bokeh_palette_usage",
+                    ConfigKey.BOKEH_PALETTE_USAGE,
                     "Specify how colors from the palette should be chosen when there are fewer categories than colors. This needs to be 'iterate' or 'linspace'",
                     "linspace",
-                    validation=lambda x: x in ["iterate", "linspace"],
+                    validation=Validator.is_supported_traversal_mode,
+                    preprocessor=Preprocessor.lower,
                 ),
                 AutolockedConfigValue(
-                    "table_img_style",
+                    ConfigKey.TABLE_IMG_STYLE,
                     "HTML style of images shown in selection tables.",
                     "max-height: 100%; max-width: 100%; object-fit: contain",
-                    preprocessor=lambda x: re.sub(r"(^[\'\"]|[\'\"]$)", "", x),
+                    preprocessor=Preprocessor.remove_quote_at_ends,
                 ),
                 AutolockedConfigValue(
-                    "tooltip_img_style",
+                    ConfigKey.TOOLTIP_IMG_STYLE,
                     "HTML style of images shown in mouse-over-data-point tooltips.",
                     "float: left; margin: 2px 2px 2px 2px; width: 60px; height: 60px;",
-                    preprocessor=lambda x: re.sub(r"(^[\'\"]|[\'\"]$)", "", x),
+                    preprocessor=Preprocessor.remove_quote_at_ends,
                 ),
             ],
         ),
         Config(
-            "data.embedding",
+            ConfigSection.DATA_EMBEDDING,
             [
                 AutolockedConfigValue(
-                    "default_reduction_method",
+                    ConfigKey.DEFAULT_REDUCTION_METHOD,
                     "Default method for dimensionality reduction. Currently either 'umap' or 'ivis'.",
                     "umap",
-                    validation=lambda x: x in ["umap", "ivis"],
+                    validation=Validator.is_supported_dimensionality_reduction,
+                    preprocessor=Preprocessor.lower,
                 ),
             ],
         ),
         Config(
-            "data.columns",
+            ConfigSection.DATA_COLUMNS,
             [
                 AutolockedConfigValue(
-                    "encoded_label_key",
+                    ConfigKey.ENCODED_LABEL_KEY,
                     "The column name for the encoded label.",
                     "label_encoded",
-                    validation=lambda x: isinstance(x, str),
+                    validation=Validator.is_str,
                 ),
                 AutolockedConfigValue(
-                    "dataset_subset_field",
+                    ConfigKey.DATASET_SUBSET_FIELD,
                     "The column name for dataset subsets.",
                     "SUBSET",
-                    validation=lambda x: isinstance(x, str),
+                    validation=Validator.is_str,
                 ),
                 AutolockedConfigValue(
-                    "embedding_field_prefix",
+                    ConfigKey.EMBEDDING_FIELD_PREFIX,
                     "The prefix of column names for embedding coordinates.",
                     "embed_",
-                    validation=lambda x: isinstance(x, str),
+                    validation=Validator.is_str,
                 ),
                 AutolockedConfigValue(
-                    "source_color_field",
+                    ConfigKey.SOURCE_COLOR_FIELD,
                     "The column name for plotted data point color.",
                     "__COLOR__",
-                    validation=lambda x: isinstance(x, str),
+                    validation=Validator.is_str,
                 ),
                 AutolockedConfigValue(
-                    "source_alpha_field",
+                    ConfigKey.SOURCE_ALPHA_FIELD,
                     "The column name for plotted data point color alpha (opacity).",
                     "__ALPHA__",
-                    validation=lambda x: isinstance(x, str),
+                    validation=Validator.is_str,
                 ),
                 AutolockedConfigValue(
-                    "search_score_field",
+                    ConfigKey.SEARCH_SCORE_FIELD,
                     "The column name for data points' score from search widgets.",
                     "__SEARCH_SCORE__",
-                    validation=lambda x: isinstance(x, str),
+                    validation=Validator.is_str,
                 ),
             ],
         ),
         Config(
-            "data.values",
+            ConfigSection.DATA_VALUES,
             [
                 AutolockedConfigValue(
-                    "abstain_decoded",
+                    ConfigKey.ABSTAIN_DECODED,
                     "The placeholder label indicating 'no label yet'.",
                     "ABSTAIN",
-                    validation=lambda x: isinstance(x, str),
+                    validation=Validator.is_str,
                 ),
                 AutolockedConfigValue(
-                    "abstain_encoded",
+                    ConfigKey.ABSTAIN_ENCODED,
                     "The encoded value of 'no label yet' which should almost always be -1, never 0 or positive.",
                     -1,
-                    validation=lambda x: isinstance(x, int) and x < 0,
+                    validation=Validator.is_negative_int,
                 ),
             ],
         ),

diff --git a/hover/config_constants.py b/hover/config_constants.py
@@ -0,0 +1,78 @@
+import re
+
+
+class ConfigSection:
+    """Section names, each used as the first argument of a `Config(...)`."""
+
+    # flat section names
+    IO = "io"
+    BACKEND = "backend"
+    VISUAL = "visual"
+
+    # dotted section names sharing the "data." prefix
+    DATA_EMBEDDING = "data.embedding"
+    DATA_COLUMNS = "data.columns"
+    DATA_VALUES = "data.values"
+
+
+class ConfigKey:
+    """Names of individual config values, grouped by the section using them."""
+
+    # ConfigSection.IO
+    DATA_SAVE_DIR = "data_save_dir"
+
+    # ConfigSection.BACKEND
+    DATAFRAME_LIBRARY = "dataframe_library"
+
+    # ConfigSection.VISUAL
+    ABSTAIN_HEXCOLOR = "abstain_hexcolor"
+    BOKEH_PALETTE = "bokeh_palette"
+    BOKEH_PALETTE_USAGE = "bokeh_palette_usage"
+    TABLE_IMG_STYLE = "table_img_style"
+    TOOLTIP_IMG_STYLE = "tooltip_img_style"
+
+    # ConfigSection.DATA_EMBEDDING
+    DEFAULT_REDUCTION_METHOD = "default_reduction_method"
+
+    # ConfigSection.DATA_COLUMNS
+    ENCODED_LABEL_KEY = "encoded_label_key"
+    DATASET_SUBSET_FIELD = "dataset_subset_field"
+    EMBEDDING_FIELD_PREFIX = "embedding_field_prefix"
+    SOURCE_COLOR_FIELD = "source_color_field"
+    SOURCE_ALPHA_FIELD = "source_alpha_field"
+    SEARCH_SCORE_FIELD = "search_score_field"
+
+    # ConfigSection.DATA_VALUES
+    ABSTAIN_DECODED = "abstain_decoded"
+    ABSTAIN_ENCODED = "abstain_encoded"
+
+
+class Validator:
+    """Boolean predicates passed as `validation=` callbacks for config values.
+
+    Each method takes a single candidate value and returns True when the
+    value is acceptable, False otherwise.
+    """
+
+    @staticmethod
+    def is_hex_color(x):
+        """Return True if `x` is a string of '#' followed by six hex digits.
+
+        Non-string input returns False rather than raising TypeError, and
+        `fullmatch` rejects a trailing newline that `match` with `$` accepts.
+        """
+        return isinstance(x, str) and bool(re.fullmatch(r"\#[0-9a-fA-F]{6}", x))
+
+    @staticmethod
+    def is_iterable(x):
+        """Return True if `x` defines `__iter__`."""
+        return hasattr(x, "__iter__")
+
+    @staticmethod
+    def is_iterable_of_hex_color(x):
+        """Return True if `x` is iterable and every item is a hex color.
+
+        An empty iterable passes. The check iterates over `x`, so a one-shot
+        iterator would be consumed by it.
+        """
+        return Validator.is_iterable(x) and all(
+            Validator.is_hex_color(item) for item in x
+        )
+
+    @staticmethod
+    def is_supported_dataframe_library(x):
+        """Return True if `x` is 'pandas' or 'polars'."""
+        return x in ("pandas", "polars")
+
+    @staticmethod
+    def is_supported_dimensionality_reduction(x):
+        """Return True if `x` is 'umap' or 'ivis'."""
+        return x in ("umap", "ivis")
+
+    @staticmethod
+    def is_supported_traversal_mode(x):
+        """Return True if `x` is 'iterate' or 'linspace'."""
+        return x in ("iterate", "linspace")
+
+    @staticmethod
+    def is_str(x):
+        """Return True if `x` is a `str` instance."""
+        return isinstance(x, str)
+
+    @staticmethod
+    def is_negative_int(x):
+        """Return True if `x` is an `int` strictly below zero."""
+        return isinstance(x, int) and x < 0
+
+
+class Preprocessor:
+    """Value transformers passed as `preprocessor=` callbacks for config values."""
+
+    @staticmethod
+    def remove_quote_at_ends(x):
+        """Strip at most one single or double quote from each end of `x`."""
+        edge_quote = re.compile(r"(^[\'\"]|[\'\"]$)")
+        return edge_quote.sub("", x)
+
+    @staticmethod
+    def lower(x):
+        """Return the result of `x.lower()`."""
+        lowered = x.lower()
+        return lowered