Skip to content

Commit

Permalink
Support polars as DataFrame engine (#67)
Browse files Browse the repository at this point in the history
and adjust all tests and doc scripts
  • Loading branch information
phurwicz committed Apr 28, 2023
1 parent 0043e6d commit ff7a16e
Show file tree
Hide file tree
Showing 45 changed files with 1,778 additions and 340 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/cross-os-conda-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ['3.8', '3.9', '3.10']
python-version: ['3.8', '3.10']
os: [ubuntu-latest, macos-latest, windows-latest]

steps:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/cross-os-install-source.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ['3.8', '3.9', '3.10']
python-version: ['3.8', '3.10']
os: [ubuntu-latest, macos-latest, windows-latest]

steps:
Expand Down
9 changes: 8 additions & 1 deletion .github/workflows/cross-os-source-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,15 @@ jobs:
with:
python-version: ${{ matrix.python-version }}

- name: Test with Tox
- name: Get dependencies
run: |
pip install --upgrade pip
pip install --upgrade tox tox-gh-actions
- name: Test - default config
run: |
tox -e test_api
- name: Test - alt config 1
run: |
tox -e test_api -- --hover-ini tests/module_config/hover_alt_config_1.ini
2 changes: 1 addition & 1 deletion .github/workflows/doc-auto-notebook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: ['3.8']
python-version: ['3.9']
os: [ubuntu-latest]

steps:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/doc-script-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ['3.8', '3.9']
python-version: ['3.9']
os: [ubuntu-latest]

steps:
Expand Down
9 changes: 8 additions & 1 deletion .github/workflows/quick-source-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,19 @@ jobs:
restore-keys: |
${{ runner.os }}-${{ runner.python-version }}-tox-env-
- name: Test and make coverage report
- name: Get dependencies
run: |
pip install --upgrade pip
pip install --upgrade tox tox-gh-actions
- name: Test - default config
run: |
tox -e test_api
- name: Test - alt config 1
run: |
tox -e test_api -- --hover-ini tests/module_config/hover_alt_config_1.ini
- name: Codacy Coverage Reporter
uses: codacy/codacy-coverage-reporter-action@master
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'push' }}
Expand Down
2 changes: 1 addition & 1 deletion docs/snippets/py/g0-4a-reduction-print.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
dataset.dfs["raw"].head(5)
dataset.dfs["raw"]().head(5)
2 changes: 1 addition & 1 deletion docs/snippets/py/t0-0a-dataset-text-print.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# each subset can be accessed as its own DataFrame
dataset.dfs["raw"].head(5)
dataset.dfs["raw"]().head(5)
2 changes: 1 addition & 1 deletion docs/snippets/py/t0-1a-vectorizer-print.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
text = dataset.dfs["raw"].loc[0, "text"]
text = dataset.dfs["raw"]().loc[0, "text"]
vec = vectorizer(text)
print(f"Text: {text}")
print(f"Vector shape: {vec.shape}")
2 changes: 1 addition & 1 deletion docs/snippets/py/t0-2a-reduction-print.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# what we did adds 'embed_2d_0' and 'embed_2d_1' columns to the DataFrames in dataset.dfs
dataset.dfs["raw"].head(5)
dataset.dfs["raw"]().head(5)
2 changes: 1 addition & 1 deletion docs/snippets/py/t3-2-dataset-selection-table.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
dataset._callback_update_selection(dataset.dfs["raw"].loc[:10])
dataset._callback_update_selection(dataset.dfs["raw"][:10])

show(dataset.sel_table, notebook_url=notebook_url)
93 changes: 57 additions & 36 deletions hover/__init__.py
Original file line number Diff line number Diff line change
@@ -1,124 +1,145 @@
"""
Module root where constants get configured.
"""
import re
from .config_constants import (
ConfigSection,
ConfigKey,
Validator,
Preprocessor,
)
from flexmod import AutolockedConfigValue, Config, ConfigIndex
from bokeh.palettes import Turbo256


config = ConfigIndex(
[
Config(
"io",
ConfigSection.IO,
[
AutolockedConfigValue(
"data_save_dir",
ConfigKey.DATA_SAVE_DIR,
"The directory path for saving labeled data.",
".",
validation=lambda x: isinstance(x, str),
validation=Validator.is_str,
),
],
),
Config(
ConfigSection.BACKEND,
[
AutolockedConfigValue(
ConfigKey.DATAFRAME_LIBRARY,
"The library to use for internal dataframes. Must be 'pandas' or 'polars'.",
"pandas",
validation=Validator.is_supported_dataframe_library,
preprocessor=Preprocessor.lower,
),
],
),
Config(
"visual",
ConfigSection.VISUAL,
[
AutolockedConfigValue(
"abstain_hexcolor",
ConfigKey.ABSTAIN_HEXCOLOR,
"Hex code of RGB color.",
"#dcdcdc",
validation=lambda x: bool(re.match(r"^\#[0-9a-fA-F]{6}$", x)),
validation=Validator.is_hex_color,
preprocessor=Preprocessor.lower,
),
AutolockedConfigValue(
"bokeh_palette",
ConfigKey.BOKEH_PALETTE,
"The bokeh color palette to use for plotting. This should be a list of hex color codes.",
Turbo256,
validation=lambda x: hasattr(x, "__iter__"),
validation=Validator.is_iterable_of_hex_color,
),
AutolockedConfigValue(
"bokeh_palette_usage",
ConfigKey.BOKEH_PALETTE_USAGE,
"Specify how colors from the palette should be chosen when there are fewer categories than colors. This needs to be 'iterate' or 'linspace'",
"linspace",
validation=lambda x: x in ["iterate", "linspace"],
validation=Validator.is_supported_traversal_mode,
preprocessor=Preprocessor.lower,
),
AutolockedConfigValue(
"table_img_style",
ConfigKey.TABLE_IMG_STYLE,
"HTML style of images shown in selection tables.",
"max-height: 100%; max-width: 100%; object-fit: contain",
preprocessor=lambda x: re.sub(r"(^[\'\"]|[\'\"]$)", "", x),
preprocessor=Preprocessor.remove_quote_at_ends,
),
AutolockedConfigValue(
"tooltip_img_style",
ConfigKey.TOOLTIP_IMG_STYLE,
"HTML style of images shown in mouse-over-data-point tooltips.",
"float: left; margin: 2px 2px 2px 2px; width: 60px; height: 60px;",
preprocessor=lambda x: re.sub(r"(^[\'\"]|[\'\"]$)", "", x),
preprocessor=Preprocessor.remove_quote_at_ends,
),
],
),
Config(
"data.embedding",
ConfigSection.DATA_EMBEDDING,
[
AutolockedConfigValue(
"default_reduction_method",
ConfigKey.DEFAULT_REDUCTION_METHOD,
"Default method for dimensionality reduction. Currently either 'umap' or 'ivis'.",
"umap",
validation=lambda x: x in ["umap", "ivis"],
validation=Validator.is_supported_dimensionality_reduction,
preprocessor=Preprocessor.lower,
),
],
),
Config(
"data.columns",
ConfigSection.DATA_COLUMNS,
[
AutolockedConfigValue(
"encoded_label_key",
ConfigKey.ENCODED_LABEL_KEY,
"The column name for the encoded label.",
"label_encoded",
validation=lambda x: isinstance(x, str),
validation=Validator.is_str,
),
AutolockedConfigValue(
"dataset_subset_field",
ConfigKey.DATASET_SUBSET_FIELD,
"The column name for dataset subsets.",
"SUBSET",
validation=lambda x: isinstance(x, str),
validation=Validator.is_str,
),
AutolockedConfigValue(
"embedding_field_prefix",
ConfigKey.EMBEDDING_FIELD_PREFIX,
"The prefix of column names for embedding coordinates.",
"embed_",
validation=lambda x: isinstance(x, str),
validation=Validator.is_str,
),
AutolockedConfigValue(
"source_color_field",
ConfigKey.SOURCE_COLOR_FIELD,
"The column name for plotted data point color.",
"__COLOR__",
validation=lambda x: isinstance(x, str),
validation=Validator.is_str,
),
AutolockedConfigValue(
"source_alpha_field",
ConfigKey.SOURCE_ALPHA_FIELD,
"The column name for plotted data point color alpha (opacity).",
"__ALPHA__",
validation=lambda x: isinstance(x, str),
validation=Validator.is_str,
),
AutolockedConfigValue(
"search_score_field",
ConfigKey.SEARCH_SCORE_FIELD,
"The column name for data points' score from search widgets.",
"__SEARCH_SCORE__",
validation=lambda x: isinstance(x, str),
validation=Validator.is_str,
),
],
),
Config(
"data.values",
ConfigSection.DATA_VALUES,
[
AutolockedConfigValue(
"abstain_decoded",
ConfigKey.ABSTAIN_DECODED,
"The placeholder label indicating 'no label yet'.",
"ABSTAIN",
validation=lambda x: isinstance(x, str),
validation=Validator.is_str,
),
AutolockedConfigValue(
"abstain_encoded",
ConfigKey.ABSTAIN_ENCODED,
"The encoded value of 'no label yet' which should almost always be -1, never 0 or positive.",
-1,
validation=lambda x: isinstance(x, int) and x < 0,
validation=Validator.is_negative_int,
),
],
),
Expand Down
78 changes: 78 additions & 0 deletions hover/config_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import re


class ConfigSection:
    """Names of the configuration sections, kept in one place as constants."""

    # Sections with a flat, single-word name.
    IO = "io"
    BACKEND = "backend"
    VISUAL = "visual"

    # Sections whose name is dotted under the "data" prefix.
    DATA_EMBEDDING = "data.embedding"
    DATA_COLUMNS = "data.columns"
    DATA_VALUES = "data.values"


class ConfigKey:
    """Names of the individual configuration values, as string constants.

    The grouping comments below mirror the section each key is registered
    under in the package-level config.
    """

    # io
    DATA_SAVE_DIR = "data_save_dir"

    # backend
    DATAFRAME_LIBRARY = "dataframe_library"

    # visual
    ABSTAIN_HEXCOLOR = "abstain_hexcolor"
    BOKEH_PALETTE = "bokeh_palette"
    BOKEH_PALETTE_USAGE = "bokeh_palette_usage"
    TABLE_IMG_STYLE = "table_img_style"
    TOOLTIP_IMG_STYLE = "tooltip_img_style"

    # data.embedding
    DEFAULT_REDUCTION_METHOD = "default_reduction_method"

    # data.columns
    ENCODED_LABEL_KEY = "encoded_label_key"
    DATASET_SUBSET_FIELD = "dataset_subset_field"
    EMBEDDING_FIELD_PREFIX = "embedding_field_prefix"
    SOURCE_COLOR_FIELD = "source_color_field"
    SOURCE_ALPHA_FIELD = "source_alpha_field"
    SEARCH_SCORE_FIELD = "search_score_field"

    # data.values
    ABSTAIN_DECODED = "abstain_decoded"
    ABSTAIN_ENCODED = "abstain_encoded"


class Validator:
    """Single-argument predicates used to validate configuration values.

    Each static method takes one candidate value and returns a bool.
    """

    @staticmethod
    def is_hex_color(x):
        """Return True iff `x` is a string of the exact form ``#RRGGBB``.

        Hex digits may be upper or lower case. Non-string values are
        rejected with False rather than raising.
        """
        # `re` raises TypeError on non-string input; a validator should
        # report "invalid" instead of crashing.
        if not isinstance(x, str):
            return False
        # fullmatch rather than match + "$": "$" also matches right before a
        # trailing newline, which would let "#dcdcdc\n" through.
        return re.fullmatch(r"\#[0-9a-fA-F]{6}", x) is not None

    @staticmethod
    def is_iterable(x):
        """Return True iff `x` exposes an ``__iter__`` attribute."""
        return hasattr(x, "__iter__")

    @staticmethod
    def is_iterable_of_hex_color(x):
        """Return True iff `x` is iterable and every element is a hex color.

        An empty iterable is accepted. A bare string such as ``"#ffffff"``
        is rejected, because its elements are single characters.
        """
        if not Validator.is_iterable(x):
            return False
        return all(Validator.is_hex_color(item) for item in x)

    @staticmethod
    def is_supported_dataframe_library(x):
        """Return True iff `x` is ``"pandas"`` or ``"polars"``."""
        return x in ["pandas", "polars"]

    @staticmethod
    def is_supported_dimensionality_reduction(x):
        """Return True iff `x` is ``"umap"`` or ``"ivis"``."""
        return x in ["umap", "ivis"]

    @staticmethod
    def is_supported_traversal_mode(x):
        """Return True iff `x` is ``"iterate"`` or ``"linspace"``."""
        return x in ["iterate", "linspace"]

    @staticmethod
    def is_str(x):
        """Return True iff `x` is an instance of ``str``."""
        return isinstance(x, str)

    @staticmethod
    def is_negative_int(x):
        """Return True iff `x` is an ``int`` strictly below zero."""
        return isinstance(x, int) and x < 0


class Preprocessor:
    """Single-argument transforms that normalize a configuration value."""

    @staticmethod
    def remove_quote_at_ends(x):
        """Return `x` without a single or double quote at its start or end.

        At most one quote character is removed per end. The end anchor
        follows the default `re` semantics of "$", so a quote sitting right
        before a final newline is removed as well.
        """
        quote_at_either_end = re.compile(r"(^[\'\"]|[\'\"]$)")
        return quote_at_either_end.sub("", x)

    @staticmethod
    def lower(x):
        """Return the result of calling `x.lower()`."""
        lowered = x.lower()
        return lowered
Loading

0 comments on commit ff7a16e

Please sign in to comment.