From ff7a16e8e4729223533e63cfd9df71b89fe19050 Mon Sep 17 00:00:00 2001 From: Pavel <74003834+phurwicz@users.noreply.github.com> Date: Fri, 28 Apr 2023 16:37:58 -0400 Subject: [PATCH] Support polars as DataFrame engine (#67) and adjust all tests and doc scripts --- .github/workflows/cross-os-conda-build.yml | 2 +- .github/workflows/cross-os-install-source.yml | 2 +- .github/workflows/cross-os-source-test.yml | 9 +- .github/workflows/doc-auto-notebook.yml | 2 +- .github/workflows/doc-script-test.yml | 2 +- .github/workflows/quick-source-test.yml | 9 +- docs/snippets/py/g0-4a-reduction-print.txt | 2 +- docs/snippets/py/t0-0a-dataset-text-print.txt | 2 +- docs/snippets/py/t0-1a-vectorizer-print.txt | 2 +- docs/snippets/py/t0-2a-reduction-print.txt | 2 +- .../py/t3-2-dataset-selection-table.txt | 2 +- hover/__init__.py | 93 +-- hover/config_constants.py | 78 +++ hover/core/dataset.py | 211 ++++--- hover/core/explorer/base.py | 80 ++- hover/core/explorer/feature.py | 19 +- hover/core/explorer/functionality.py | 104 +-- hover/core/explorer/local_config.py | 12 +- hover/core/explorer/specialization.py | 30 +- hover/core/local_config.py | 13 +- hover/core/neural.py | 11 +- hover/core/representation/local_config.py | 10 + hover/core/representation/manifold.py | 4 +- hover/core/representation/reduction.py | 18 +- hover/module_config.py | 25 +- hover/recipes/local_config.py | 9 + hover/recipes/subroutine.py | 21 +- hover/utils/bokeh_helper/__init__.py | 12 +- hover/utils/bokeh_helper/local_config.py | 10 + hover/utils/dataframe.py | 590 ++++++++++++++++++ hover/utils/typecheck.py | 21 + setup.py | 4 +- tests/conftest.py | 78 ++- tests/core/explorer/local_helper.py | 6 +- tests/core/explorer/test_functionality.py | 23 +- tests/core/representation/test_reduction.py | 2 +- tests/core/test_dataset.py | 16 +- tests/core/test_neural.py | 3 +- tests/module_config/hover_alt_config_1.ini | 9 +- tests/recipes/local_helper.py | 9 +- tests/recipes/test_experimental.py | 10 +- tests/recipes/test_stable.py | 19 +- tests/utils/test_dataframe.py | 502 +++++++++++++++ tests/utils/test_snorkel_helper.py | 2 +- tests/utils/test_typecheck.py | 28 + 45 files changed, 1778 insertions(+), 340 deletions(-) create mode 100644 hover/config_constants.py create mode 100644 hover/recipes/local_config.py create mode 100644 hover/utils/dataframe.py create mode 100644 hover/utils/typecheck.py create mode 100644 tests/utils/test_dataframe.py create mode 100644 tests/utils/test_typecheck.py diff --git a/.github/workflows/cross-os-conda-build.yml b/.github/workflows/cross-os-conda-build.yml index a0d7bd21..ffb35a5f 100644 --- a/.github/workflows/cross-os-conda-build.yml +++ b/.github/workflows/cross-os-conda-build.yml @@ -14,7 +14,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.8', '3.9', '3.10'] + python-version: ['3.8', '3.10'] os: [ubuntu-latest, macos-latest, windows-latest] steps: diff --git a/.github/workflows/cross-os-install-source.yml b/.github/workflows/cross-os-install-source.yml index f7e6c468..9df6f1ac 100644 --- a/.github/workflows/cross-os-install-source.yml +++ b/.github/workflows/cross-os-install-source.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.8', '3.9', '3.10'] + python-version: ['3.8', '3.10'] os: [ubuntu-latest, macos-latest, windows-latest] steps: diff --git a/.github/workflows/cross-os-source-test.yml b/.github/workflows/cross-os-source-test.yml index 82346997..7daf9ec6 100644 --- a/.github/workflows/cross-os-source-test.yml +++ b/.github/workflows/cross-os-source-test.yml @@ -25,8 +25,15 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Test with Tox + - name: Get dependencies run: | pip install --upgrade pip pip install --upgrade tox tox-gh-actions + + - name: Test - default config + run: | tox -e test_api + + - name: Test - alt config 1 + run: | + tox -e test_api -- --hover-ini tests/module_config/hover_alt_config_1.ini diff --git a/.github/workflows/doc-auto-notebook.yml b/.github/workflows/doc-auto-notebook.yml index 9554ae02..4d174494 100644 --- a/.github/workflows/doc-auto-notebook.yml +++ b/.github/workflows/doc-auto-notebook.yml @@ -14,7 +14,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ['3.8'] + python-version: ['3.9'] os: [ubuntu-latest] steps: diff --git a/.github/workflows/doc-script-test.yml b/.github/workflows/doc-script-test.yml index 96361caf..c305e1d5 100644 --- a/.github/workflows/doc-script-test.yml +++ b/.github/workflows/doc-script-test.yml @@ -31,7 +31,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.8', '3.9'] + python-version: ['3.9'] os: [ubuntu-latest] steps: diff --git a/.github/workflows/quick-source-test.yml b/.github/workflows/quick-source-test.yml index 23b2b7b9..777a5e7a 100644 --- a/.github/workflows/quick-source-test.yml +++ b/.github/workflows/quick-source-test.yml @@ -47,12 +47,19 @@ jobs: restore-keys: | ${{ runner.os }}-${{ runner.python-version }}-tox-env- - - name: Test and make coverage report + - name: Get dependencies run: | pip install --upgrade pip pip install --upgrade tox tox-gh-actions + + - name: Test - default config + run: | tox -e test_api + - name: Test - alt config 1 + run: | + tox -e test_api -- --hover-ini tests/module_config/hover_alt_config_1.ini + - name: Codacy Coverage Reporter uses: codacy/codacy-coverage-reporter-action@master if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'push' }} diff --git a/docs/snippets/py/g0-4a-reduction-print.txt b/docs/snippets/py/g0-4a-reduction-print.txt index 3c22c070..bf4fd674 100644 --- a/docs/snippets/py/g0-4a-reduction-print.txt +++ b/docs/snippets/py/g0-4a-reduction-print.txt @@ -1 +1 @@ -dataset.dfs["raw"].head(5) +dataset.dfs["raw"]().head(5) diff --git a/docs/snippets/py/t0-0a-dataset-text-print.txt b/docs/snippets/py/t0-0a-dataset-text-print.txt index 2b561258..3fe85391 100644 --- a/docs/snippets/py/t0-0a-dataset-text-print.txt +++ b/docs/snippets/py/t0-0a-dataset-text-print.txt @@ -1,2 +1,2 @@ # each subset can be accessed as its own DataFrame -dataset.dfs["raw"].head(5) +dataset.dfs["raw"]().head(5) diff --git a/docs/snippets/py/t0-1a-vectorizer-print.txt b/docs/snippets/py/t0-1a-vectorizer-print.txt index 5cf611a9..1a4664e9 100644 --- a/docs/snippets/py/t0-1a-vectorizer-print.txt +++ b/docs/snippets/py/t0-1a-vectorizer-print.txt @@ -1,4 +1,4 @@ -text = dataset.dfs["raw"].loc[0, "text"] +text = dataset.dfs["raw"]().loc[0, "text"] vec = vectorizer(text) print(f"Text: {text}") print(f"Vector shape: {vec.shape}") diff --git a/docs/snippets/py/t0-2a-reduction-print.txt b/docs/snippets/py/t0-2a-reduction-print.txt index 3447b46a..d26644f6 100644 --- a/docs/snippets/py/t0-2a-reduction-print.txt +++ b/docs/snippets/py/t0-2a-reduction-print.txt @@ -1,2 +1,2 @@ # what we did adds 'embed_2d_0' and 'embed_2d_1' columns to the DataFrames in dataset.dfs -dataset.dfs["raw"].head(5) +dataset.dfs["raw"]().head(5) diff --git a/docs/snippets/py/t3-2-dataset-selection-table.txt b/docs/snippets/py/t3-2-dataset-selection-table.txt index 5f262797..13b7239c 100644 --- a/docs/snippets/py/t3-2-dataset-selection-table.txt +++ b/docs/snippets/py/t3-2-dataset-selection-table.txt @@ -1,3 +1,3 @@ -dataset._callback_update_selection(dataset.dfs["raw"].loc[:10]) +dataset._callback_update_selection(dataset.dfs["raw"][:10]) show(dataset.sel_table, notebook_url=notebook_url) diff --git a/hover/__init__.py b/hover/__init__.py index 0b3fd522..77ed7e0d 100644 --- a/hover/__init__.py +++ b/hover/__init__.py @@ -1,124 +1,145 @@ """ Module root where constants get configured. """ -import re +from .config_constants import ( + ConfigSection, + ConfigKey, + Validator, + Preprocessor, +) from flexmod import AutolockedConfigValue, Config, ConfigIndex from bokeh.palettes import Turbo256 + config = ConfigIndex( [ Config( - "io", + ConfigSection.IO, [ AutolockedConfigValue( - "data_save_dir", + ConfigKey.DATA_SAVE_DIR, "The directory path for saving labeled data.", ".", - validation=lambda x: isinstance(x, str), + validation=Validator.is_str, + ), + ], + ), + Config( + ConfigSection.BACKEND, + [ + AutolockedConfigValue( + ConfigKey.DATAFRAME_LIBRARY, + "The library to use for internal dataframes. Must be 'pandas' or 'polars'.", + "pandas", + validation=Validator.is_supported_dataframe_library, + preprocessor=Preprocessor.lower, ), ], ), Config( - "visual", + ConfigSection.VISUAL, [ AutolockedConfigValue( - "abstain_hexcolor", + ConfigKey.ABSTAIN_HEXCOLOR, "Hex code of RGB color.", "#dcdcdc", - validation=lambda x: bool(re.match(r"^\#[0-9a-fA-F]{6}$", x)), + validation=Validator.is_hex_color, + preprocessor=Preprocessor.lower, ), AutolockedConfigValue( - "bokeh_palette", + ConfigKey.BOKEH_PALETTE, "The bokeh color palette to use for plotting. This should be a list of hex color codes.", Turbo256, - validation=lambda x: hasattr(x, "__iter__"), + validation=Validator.is_iterable_of_hex_color, ), AutolockedConfigValue( - "bokeh_palette_usage", + ConfigKey.BOKEH_PALETTE_USAGE, "Specify how colors from the palette should be chosen when there are fewer categories than colors. This needs to be 'iterate' or 'linspace'", "linspace", - validation=lambda x: x in ["iterate", "linspace"], + validation=Validator.is_supported_traversal_mode, + preprocessor=Preprocessor.lower, ), AutolockedConfigValue( - "table_img_style", + ConfigKey.TABLE_IMG_STYLE, "HTML style of images shown in selection tables.", "max-height: 100%; max-width: 100%; object-fit: contain", - preprocessor=lambda x: re.sub(r"(^[\'\"]|[\'\"]$)", "", x), + preprocessor=Preprocessor.remove_quote_at_ends, ), AutolockedConfigValue( - "tooltip_img_style", + ConfigKey.TOOLTIP_IMG_STYLE, "HTML style of images shown in mouse-over-data-point tooltips.", "float: left; margin: 2px 2px 2px 2px; width: 60px; height: 60px;", - preprocessor=lambda x: re.sub(r"(^[\'\"]|[\'\"]$)", "", x), + preprocessor=Preprocessor.remove_quote_at_ends, ), ], ), Config( - "data.embedding", + ConfigSection.DATA_EMBEDDING, [ AutolockedConfigValue( - "default_reduction_method", + ConfigKey.DEFAULT_REDUCTION_METHOD, "Default method for dimensionality reduction. Currently either 'umap' or 'ivis'.", "umap", - validation=lambda x: x in ["umap", "ivis"], + validation=Validator.is_supported_dimensionality_reduction, + preprocessor=Preprocessor.lower, ), ], ), Config( - "data.columns", + ConfigSection.DATA_COLUMNS, [ AutolockedConfigValue( - "encoded_label_key", + ConfigKey.ENCODED_LABEL_KEY, "The column name for the encoded label.", "label_encoded", - validation=lambda x: isinstance(x, str), + validation=Validator.is_str, ), AutolockedConfigValue( - "dataset_subset_field", + ConfigKey.DATASET_SUBSET_FIELD, "The column name for dataset subsets.", "SUBSET", - validation=lambda x: isinstance(x, str), + validation=Validator.is_str, ), AutolockedConfigValue( - "embedding_field_prefix", + ConfigKey.EMBEDDING_FIELD_PREFIX, "The prefix of column names for embedding coordinates.", "embed_", - validation=lambda x: isinstance(x, str), + validation=Validator.is_str, ), AutolockedConfigValue( - "source_color_field", + ConfigKey.SOURCE_COLOR_FIELD, "The column name for plotted data point color.", "__COLOR__", - validation=lambda x: isinstance(x, str), + validation=Validator.is_str, ), AutolockedConfigValue( - "source_alpha_field", + ConfigKey.SOURCE_ALPHA_FIELD, "The column name for plotted data point color alpha (opacity).", "__ALPHA__", - validation=lambda x: isinstance(x, str), + validation=Validator.is_str, ), AutolockedConfigValue( - "search_score_field", + ConfigKey.SEARCH_SCORE_FIELD, "The column name for data points' score from search widgets.", "__SEARCH_SCORE__", - validation=lambda x: isinstance(x, str), + validation=Validator.is_str, ), ], ), Config( - "data.values", + ConfigSection.DATA_VALUES, [ AutolockedConfigValue( - "abstain_decoded", + ConfigKey.ABSTAIN_DECODED, "The placeholder label indicating 'no label yet'.", "ABSTAIN", - validation=lambda x: isinstance(x, str), + validation=Validator.is_str, ), AutolockedConfigValue( - "abstain_encoded", + ConfigKey.ABSTAIN_ENCODED, "The encoded value of 'no label yet' which should almost always be -1, never 0 or positive.", -1, - validation=lambda x: isinstance(x, int) and x < 0, + validation=Validator.is_negative_int, ), ], ), diff --git a/hover/config_constants.py b/hover/config_constants.py new file mode 100644 index 00000000..e7c80ace --- /dev/null +++ b/hover/config_constants.py @@ -0,0 +1,78 @@ +import re + + +class ConfigSection: + IO = "io" + BACKEND = "backend" + VISUAL = "visual" + DATA_EMBEDDING = "data.embedding" + DATA_COLUMNS = "data.columns" + DATA_VALUES = "data.values" + + +class ConfigKey: + DATA_SAVE_DIR = "data_save_dir" + DATAFRAME_LIBRARY = "dataframe_library" + ABSTAIN_HEXCOLOR = "abstain_hexcolor" + BOKEH_PALETTE = "bokeh_palette" + BOKEH_PALETTE_USAGE = "bokeh_palette_usage" + TABLE_IMG_STYLE = "table_img_style" + TOOLTIP_IMG_STYLE = "tooltip_img_style" + DEFAULT_REDUCTION_METHOD = "default_reduction_method" + ENCODED_LABEL_KEY = "encoded_label_key" + DATASET_SUBSET_FIELD = "dataset_subset_field" + EMBEDDING_FIELD_PREFIX = "embedding_field_prefix" + SOURCE_COLOR_FIELD = "source_color_field" + SOURCE_ALPHA_FIELD = "source_alpha_field" + SEARCH_SCORE_FIELD = "search_score_field" + ABSTAIN_DECODED = "abstain_decoded" + ABSTAIN_ENCODED = "abstain_encoded" + + +class Validator: + @staticmethod + def is_hex_color(x): + return bool(re.match(r"^\#[0-9a-fA-F]{6}$", x)) + + @staticmethod + def is_iterable(x): + return hasattr(x, "__iter__") + + @staticmethod + def is_iterable_of_hex_color(x): + if not Validator.is_iterable(x): + return False + for i in x: + if not Validator.is_hex_color(i): + return False + return True + + @staticmethod + def is_supported_dataframe_library(x): + return x in ["pandas", "polars"] + + @staticmethod + def is_supported_dimensionality_reduction(x): + return x in ["umap", "ivis"] + + @staticmethod + def is_supported_traversal_mode(x): + return x in ["iterate", "linspace"] + + @staticmethod + def is_str(x): + return isinstance(x, str) + + @staticmethod + def is_negative_int(x): + return isinstance(x, int) and x < 0 + + +class Preprocessor: + @staticmethod + def remove_quote_at_ends(x): + return re.sub(r"(^[\'\"]|[\'\"]$)", "", x) + + @staticmethod + def lower(x): + return x.lower() diff --git a/hover/core/dataset.py b/hover/core/dataset.py index b73c0b8d..79df357d 100644 --- a/hover/core/dataset.py +++ b/hover/core/dataset.py @@ -11,15 +11,20 @@ - loading data for training models """ import os -import hover +import operator import numpy as np -import pandas as pd from tqdm import tqdm from collections import OrderedDict -from hover import module_config +from hover.module_config import ( + DataFrame as DF, + ABSTAIN_DECODED, + ABSTAIN_ENCODED, + DATA_SAVE_DIR, +) from hover.core import Loggable from hover.utils.bokeh_helper import auto_label_color from hover.utils.misc import current_time +from hover.utils.typecheck import TypedValueDict from bokeh.models import ( Button, CheckboxGroup, @@ -35,6 +40,7 @@ dataset_default_sel_table_kwargs, COLOR_GLYPH_TEMPLATE, DATASET_SUBSET_FIELD, + DEFAULT_REDUCTION_METHOD, embedding_field, ) @@ -112,7 +118,7 @@ def setup_dfs( | `label_key` | `str` | the key for the `**str**` label in supervised data | """ - def dictl_transform(dictl, labels=True): + def dictl_transform(dictl, subset, labels=True): """ Burner function to transform the input list of dictionaries into standard format. """ @@ -135,31 +141,56 @@ def burner(d): trans_d = {key_transform.get(_k, _k): _v for _k, _v in d.items()} if not labels: - trans_d["label"] = module_config.ABSTAIN_DECODED + trans_d["label"] = ABSTAIN_DECODED + + trans_d[DATASET_SUBSET_FIELD] = subset return trans_d return [burner(_d) for _d in dictl] # standardize records - dictls = { - "raw": dictl_transform(raw_dictl, labels=False), - "train": dictl_transform(train_dictl), - "dev": dictl_transform(dev_dictl), - "test": dictl_transform(test_dictl), - } + dictls = [ + *dictl_transform(raw_dictl, "raw", labels=False), + *dictl_transform(train_dictl, "train"), + *dictl_transform(dev_dictl, "dev"), + *dictl_transform(test_dictl, "test"), + ] + all_subsets_df = DF.construct(dictls) + + assert all_subsets_df.shape[0] > 0, "Expected non-empty dataset" + assert ( + self.__class__.FEATURE_KEY in all_subsets_df.columns + ), f"Expected feature key {self.__class__.FEATURE_KEY}" + assert "label" in all_subsets_df.columns, "Expected label key 'label'" # initialize dataframes - self.dfs = dict() - for _key, _dictl in dictls.items(): - if _dictl: - _df = pd.DataFrame(_dictl) - assert self.__class__.FEATURE_KEY in _df.columns - assert "label" in _df.columns - else: - _df = pd.DataFrame(columns=[self.__class__.FEATURE_KEY, "label"]) + self.dfs = TypedValueDict(DF) + for _key in ["raw", "train", "dev", "test"]: + self.dfs[_key] = all_subsets_df.filter_rows_by_operator( + DATASET_SUBSET_FIELD, operator.eq, _key + ) - self.dfs[_key] = _df + @property + def dfs(self): + """ + ???+ note "Subset -> DataFrame mapping." + """ + return self._dfs + + @dfs.setter + def dfs(self, dfs): + assert isinstance( + dfs, TypedValueDict + ), f"Expected TypedValueDict, got {type(dfs)}" + assert not hasattr(self, "_dfs"), "Resetting `dfs` is forbidden." + self._dfs = dfs + + def subset(self, key): + """ + ???+ note "Return the DataFrame by reference for the given subset." + """ + return self.dfs[key] def copy(self): """ @@ -179,7 +210,7 @@ def compute_feature_index(self): """ feature_to_subset_idx = {} for _subset, _df in self.dfs.items(): - _values = _df[self.__class__.FEATURE_KEY].values + _values = DF.series_values(_df[self.__class__.FEATURE_KEY]) for i, _val in enumerate(_values): if _val in feature_to_subset_idx: raise ValueError( @@ -196,7 +227,9 @@ def locate_by_feature_value(self, value, auto_recompute=True): """ subset, index = self.feature_to_subset_idx[value] - current_value = self.dfs[subset].at[index, self.__class__.FEATURE_KEY] + current_value = self.dfs[subset].get_cell_by_row_column( + index, self.__class__.FEATURE_KEY + ) if current_value != value: if auto_recompute: self._warn("locate_by_feature_value mismatch. Recomputing index.") @@ -211,13 +244,7 @@ def to_pandas(self): """ ???+ note "Export to a pandas DataFrame." """ - dfs = [] - for _subset in ["raw", "train", "dev", "test"]: - _df = self.dfs[_subset].copy() - _df[DATASET_SUBSET_FIELD] = _subset - dfs.append(_df) - - return pd.concat(dfs, axis=0) + return DF.concat_rows(self.dfs.values()).to_pandas() @classmethod def from_pandas(cls, df, **kwargs): @@ -229,11 +256,11 @@ def from_pandas(cls, df, **kwargs): """ SUBSETS = cls.SCRATCH_SUBSETS + cls.PUBLIC_SUBSETS + cls.PRIVATE_SUBSETS - if DATASET_SUBSET_FIELD not in df.columns: - raise ValueError( - f"Expecting column '{DATASET_SUBSET_FIELD}' in the DataFrame which takes values from {SUBSETS}" - ) + assert ( + DATASET_SUBSET_FIELD in df.columns + ), f"Expecting column '{DATASET_SUBSET_FIELD}' in the DataFrame which takes values from {SUBSETS}" + # use the 'silly' approach for robustness dictls = {} for _subset in ["raw", "train", "dev", "test"]: _sub_df = df[df[DATASET_SUBSET_FIELD] == _subset] @@ -415,22 +442,18 @@ def callback_commit(event): ) return - sel_slice = self.dfs[sub_k].iloc[selected_idx] - valid_slice = sel_slice[ - sel_slice["label"] != module_config.ABSTAIN_DECODED - ] + sel_slice = self.dfs[sub_k].select_rows(selected_idx) + valid_slice = sel_slice.filter_rows_by_operator( + "label", operator.ne, ABSTAIN_DECODED + ) # concat to the end and do some accounting size_before = self.dfs[sub_to].shape[0] - self.dfs[sub_to] = pd.concat( - [self.dfs[sub_to], valid_slice], - axis=0, - sort=False, - ignore_index=True, - ) + self.dfs[sub_to] = DF.concat_rows([self.dfs[sub_to], valid_slice]) size_mid = self.dfs[sub_to].shape[0] - self.dfs[sub_to].drop_duplicates( - subset=[self.__class__.FEATURE_KEY], keep="last", inplace=True + self.dfs[sub_to] = self.dfs[sub_to].unique( + subset=[self.__class__.FEATURE_KEY], + keep="last", ) size_after = self.dfs[sub_to].shape[0] @@ -463,10 +486,10 @@ def callback_view(): sel_slices = [] for subset in subsets: selected_idx = sorted(explorer.sources[subset].selected.indices) - sub_slice = explorer.dfs[subset].iloc[selected_idx] + sub_slice = explorer.dfs[subset].select_rows(selected_idx) sel_slices.append(sub_slice) - selected = pd.concat(sel_slices, axis=0) + selected = DF.concat_rows(sel_slices) self._callback_update_selection(selected) def callback_view_refresh(): @@ -519,15 +542,15 @@ def setup_label_coding(self, verbose=True, debug=False): all_labels = set() for _key in [*self.__class__.PUBLIC_SUBSETS, *self.__class__.PRIVATE_SUBSETS]: _df = self.dfs[_key] - _found_labels = set(_df["label"].tolist()) + _found_labels = set(DF.series_tolist(_df["label"])) all_labels = all_labels.union(_found_labels) # exclude ABSTAIN from self.classes, but include it in the encoding - all_labels.discard(module_config.ABSTAIN_DECODED) + all_labels.discard(ABSTAIN_DECODED) self.classes = sorted(all_labels) self.label_encoder = { **{_label: _i for _i, _label in enumerate(self.classes)}, - module_config.ABSTAIN_DECODED: module_config.ABSTAIN_ENCODED, + ABSTAIN_DECODED: ABSTAIN_ENCODED, } self.label_decoder = {_v: _k for _k, _v in self.label_encoder.items()} @@ -549,14 +572,16 @@ def validate_labels(self, raise_exception=True): for _key in [*self.__class__.PUBLIC_SUBSETS, *self.__class__.PRIVATE_SUBSETS]: _invalid_indices = None assert "label" in self.dfs[_key].columns - _mask = self.dfs[_key]["label"].apply( - lambda x: int(x in self.label_encoder) + _mask = np.logical_not( + self.dfs[_key].column_isin( + "label", + self.label_encoder.keys(), + ) ) - # DO NOT change the "==" to "is"; False in pandas is not False below - _invalid_indices = np.where(_mask == 0)[0].tolist() + _invalid_indices = np.where(_mask)[0].tolist() if _invalid_indices: self._fail(f"Subset {_key} has invalid labels:") - self._print(self.dfs[_key].loc[_invalid_indices]) + self._print(self.dfs[_key].select_rows(_invalid_indices)()) if raise_exception: raise ValueError("invalid labels") @@ -579,7 +604,7 @@ def callback_export(event, path_root=None): # auto-determine the export path root if path_root is None: timestamp = current_time("%Y%m%d%H%M%S") - export_dir = module_config.DATA_SAVE_DIR + export_dir = DATA_SAVE_DIR path_root = os.path.join(export_dir, f"hover-dataset-{timestamp}") export_df = self.to_pandas() @@ -641,13 +666,13 @@ def update_population(): self.setup_label_coding() # re-compute label population - eff_labels = [module_config.ABSTAIN_DECODED, *self.classes] + eff_labels = [ABSTAIN_DECODED, *self.classes] color_dict = auto_label_color(self.classes) eff_colors = [color_dict[_label] for _label in eff_labels] pop_data = dict(color=eff_colors, label=eff_labels) for _subset in subsets: - _subpop = self.dfs[_subset]["label"].value_counts() + _subpop = self.dfs[_subset].column_counter("label") pop_data[f"count_{_subset}"] = [ _subpop.get(_label, 0) for _label in eff_labels ] @@ -686,7 +711,7 @@ def update_selection(selected_df): """ To be triggered as a subroutine of `self.selection_viewer`. """ - sel_source.data = selected_df.to_dict(orient="list") + sel_source.data = selected_df.to_dict_of_lists() # now that selection table has changed, clear sub-selection sel_source.selected.indices = [] @@ -703,7 +728,9 @@ def patch_edited_selection(): feature_value = sel_source.data[self.__class__.FEATURE_KEY][i] subset, idx = self.locate_by_feature_value(feature_value) for key in sel_source.data.keys(): - self.dfs[subset].at[idx, key] = sel_source.data[key][i] + self.dfs[subset].set_cell_by_row_column( + idx, key, sel_source.data[key][i] + ) self._good(f"Selection table: edited {len(raw_indices)} dataset rows.") # if edited labels (which is common), then population has changed @@ -731,21 +758,17 @@ def df_deduplicate(self): for _key in ordered_subsets: before[_key] = self.dfs[_key].shape[0] columns[_key] = self.dfs[_key].columns - self.dfs[_key]["__subset"] = _key + # update subset for rows that have been copied / moved + self.dfs[_key].set_column_by_constant(DATASET_SUBSET_FIELD, _key) # concatenate in order and deduplicate - overall_df = pd.concat( - [self.dfs[_key] for _key in ordered_subsets], axis=0, sort=False - ) - overall_df.drop_duplicates( - subset=[self.__class__.FEATURE_KEY], keep="last", inplace=True - ) - overall_df.reset_index(drop=True, inplace=True) + overall_df = DF.concat_rows([self.dfs[_key] for _key in ordered_subsets]) + overall_df = overall_df.unique(subset=[self.__class__.FEATURE_KEY], keep="last") # cut up slices for _key in ordered_subsets: - self.dfs[_key] = overall_df[overall_df["__subset"] == _key].reset_index( - drop=True, inplace=False + self.dfs[_key] = overall_df.filter_rows_by_operator( + DATASET_SUBSET_FIELD, operator.eq, _key )[columns[_key]] after[_key] = self.dfs[_key].shape[0] self._info(f"--subset {_key} rows: {before[_key]} -> {after[_key]}.") @@ -760,7 +783,9 @@ def vectorizer_lookup(self): def vectorizer_lookup(self, *args, **kwargs): self._fail("assigning vectorizer lookup by reference is forbidden.") - def compute_nd_embedding(self, vectorizer, method=None, dimension=2, **kwargs): + def compute_nd_embedding( + self, vectorizer, method=DEFAULT_REDUCTION_METHOD, dimension=2, **kwargs + ): """ ???+ note "Get embeddings in n-dimensional space and return the dimensionality reducer." Reference: [`DimensionalityReducer`](https://github.com/phurwicz/hover/blob/main/hover/core/representation/reduction.py) @@ -774,9 +799,12 @@ def compute_nd_embedding(self, vectorizer, method=None, dimension=2, **kwargs): """ from hover.core.representation.reduction import DimensionalityReducer - if method is None: - method = hover.config["data.embedding"]["default_reduction_method"] # register the vectorizer for scenarios that may need it + assert ( + isinstance(dimension, int) and dimension >= 2 + ), "Invalid dimension {dimension}}" + if dimension in self.vectorizer_lookup: + self._warn(f"Overwriting embedding with dimension {dimension}.") self.vectorizer_lookup[dimension] = vectorizer # prepare input vectors to manifold learning @@ -784,16 +812,19 @@ def compute_nd_embedding(self, vectorizer, method=None, dimension=2, **kwargs): trans_subset = [*self.__class__.PRIVATE_SUBSETS] assert not set(fit_subset).intersection(set(trans_subset)), "Unexpected overlap" - assert isinstance(dimension, int) and dimension >= 2 embedding_cols = [embedding_field(dimension, i) for i in range(dimension)] # compute vectors and keep track which where to slice the array for fitting feature_inp = [] for _key in fit_subset: - feature_inp.extend(self.dfs[_key][self.__class__.FEATURE_KEY].tolist()) + feature_inp.extend( + DF.series_tolist(self.dfs[_key][self.__class__.FEATURE_KEY]) + ) fit_num = len(feature_inp) for _key in trans_subset: - feature_inp.extend(self.dfs[_key][self.__class__.FEATURE_KEY].tolist()) + feature_inp.extend( + DF.series_tolist(self.dfs[_key][self.__class__.FEATURE_KEY]) + ) trans_arr = np.array( [vectorizer(_inp) for _inp in tqdm(feature_inp, desc="Vectorizing")] ) @@ -810,12 +841,12 @@ def compute_nd_embedding(self, vectorizer, method=None, dimension=2, **kwargs): trans_embedding = reducer.transform(trans_arr[fit_num:], method) # assign x and y coordinates to dataset - start_idx = 0 for _subset, _embedding in [ (fit_subset, fit_embedding), (trans_subset, trans_embedding), ]: - # edge case: embedding is too small + start_idx = 0 + # edge case: embedding has no rows if _embedding.shape[0] < 1: for _key in _subset: assert ( @@ -824,17 +855,25 @@ def compute_nd_embedding(self, vectorizer, method=None, dimension=2, **kwargs): continue for _key in _subset: _length = self.dfs[_key].shape[0] + _embedding_slice = _embedding[start_idx : (start_idx + _length), :] + assert ( + _length == _embedding_slice.shape[0] + ), f"Unexpected length {_length} vs {_embedding_slice.shape}; embedding total {_embedding.shape}" for _i in range(dimension): _col = embedding_cols[_i] - self.dfs[_key][_col] = pd.Series( - _embedding[start_idx : (start_idx + _length), _i] + self.dfs[_key].set_column_by_array( + _col, + _embedding_slice[:, _i], + indices=None, ) start_idx += _length self._good(f"Computed {dimension}-d embedding in columns {embedding_cols}") return reducer - def compute_2d_embedding(self, vectorizer, method=None, **kwargs): + def compute_2d_embedding( + self, vectorizer, method=DEFAULT_REDUCTION_METHOD, **kwargs + ): """ ???+ note "Get embeddings in the xy-plane and return the dimensionality reducer." A special case of `compute_nd_embedding`. @@ -846,7 +885,7 @@ def compute_2d_embedding(self, vectorizer, method=None, **kwargs): | `**kwargs` | | kwargs for `DimensionalityReducer` | """ reducer = self.compute_nd_embedding( - vectorizer, method=None, dimension=2, **kwargs + vectorizer, method=method, dimension=2, **kwargs ) return reducer @@ -869,7 +908,9 @@ def loader(self, key, *vectorizers, batch_size=64, smoothing_coeff=0.0): ) # take the slice that has a meaningful label - df = self.dfs[key][self.dfs[key]["label"] != module_config.ABSTAIN_DECODED] + df = self.dfs[key].filter_rows_by_operator( + "label", operator.ne, ABSTAIN_DECODED + ) # edge case: valid slice is too small if df.shape[0] < 1: @@ -877,7 +918,7 @@ def loader(self, key, *vectorizers, batch_size=64, smoothing_coeff=0.0): batch_size = min(batch_size, df.shape[0]) # prepare output vectors - labels = df["label"].apply(lambda x: self.label_encoder[x]).tolist() + labels = df.column_map("label", self.label_encoder, output="list") output_vectors = one_hot(labels, num_classes=len(self.classes)) if smoothing_coeff > 0.0: output_vectors = label_smoothing( @@ -887,7 +928,7 @@ def loader(self, key, *vectorizers, batch_size=64, smoothing_coeff=0.0): # prepare input vectors assert len(vectorizers) > 0, "Expected at least one vectorizer" multi_flag = len(vectorizers) > 1 - features = df[self.__class__.FEATURE_KEY].tolist() + features = DF.series_tolist(df[self.__class__.FEATURE_KEY]) input_vector_lists = [] for _vec_func in vectorizers: diff --git a/hover/core/explorer/base.py b/hover/core/explorer/base.py index 11843236..f8b5470f 100644 --- a/hover/core/explorer/base.py +++ b/hover/core/explorer/base.py @@ -1,7 +1,6 @@ """ ???+ note "Base class(es) for ALL explorer implementations." """ -import pandas as pd from abc import ABC, abstractmethod from collections import OrderedDict, defaultdict from bokeh.events import SelectionGeometry @@ -15,6 +14,8 @@ from hover.utils.bokeh_helper import bokeh_hover_tooltip from hover.utils.meta.traceback import RichTracebackABCMeta from hover.utils.misc import RootUnionFind +from hover.utils.typecheck import TypedValueDict +from hover.module_config import DataFrame from .local_config import SEARCH_SCORE_FIELD STANDARD_PLOT_TOOLS = [ @@ -51,7 +52,7 @@ class BokehBaseExplorer(Loggable, ABC, metaclass=RichTracebackABCMeta): SELECTION_PROCESSING_STAGES = ["save", "load", "write", "read"] PRIMARY_FEATURE = None - MANDATORY_COLUMNS = ["label"] + MANDATORY_COLUMN_TO_TYPE_DEFAULT = {"label": (str, None)} TOOLTIP_KWARGS = { "label": {"label": "Label"}, "coords": True, @@ -285,7 +286,7 @@ def adjust_slider(): col_patch in _df.columns ), f"Subset {_key} expecting column {col_patch} among columns, got {_df.columns}" # find all array lengths; note that the data subset can be empty - _num_patches_seen = _df[col_patch].apply(len).values + _num_patches_seen = _df.column_apply(col_patch, len, output="list") assert ( len(set(_num_patches_seen)) <= 1 ), f"Expecting consistent number of patches, got {_num_patches_seen}" @@ -309,7 +310,7 @@ def adjust_slider(): def update_patch(attr, old, new): for _key, _df in self.dfs.items(): # calculate the patch corresponding to slider value - _value = [_arr[new] for _arr in _df[col_patch].values] + _value = [_arr[new] for _arr in DataFrame.series_values(_df[col_patch])] _slice = slice(_df.shape[0]) _patch = {col_original: [(_slice, _value)]} self.sources[_key].patch(_patch) @@ -317,15 +318,36 @@ def update_patch(attr, old, new): slider.on_change("value", update_patch) self._good(f"Patching {col_original} using {col_patch}") - def _mandatory_column_defaults(self): + def _mandatory_column_info(self): """ - ???+ note "Mandatory columns and default values." + ???+ note "Mandatory columns, types, and default values." If default value is None, will raise exception if the column is not found. """ - return {_col: None for _col in self.__class__.MANDATORY_COLUMNS} + return { + _col: {"type": _type, "default": _default} + for _col, ( + _type, + _default, + ) in self.__class__.MANDATORY_COLUMN_TO_TYPE_DEFAULT.items() + } - def _setup_dfs(self, df_dict, copy=False): + @property + def dfs(self): + """ + ???+ note "Subset -> DataFrame mapping." + """ + return self._dfs + + @dfs.setter + def dfs(self, dfs): + assert isinstance( + dfs, TypedValueDict + ), f"Expected TypedValueDict, got {type(dfs)}" + assert not hasattr(self, "_dfs"), "Resetting `dfs` is forbidden." + self._dfs = dfs + + def _setup_dfs(self, df_dict): """ ???+ note "Check and store DataFrames **by reference by default**." Intended to be extended in child classes for pre/post processing. @@ -333,9 +355,10 @@ def _setup_dfs(self, df_dict, copy=False): | Param | Type | Description | | :---------- | :----- | :--------------------------- | | `df_dict` | `dict` | `str` -> `DataFrame` mapping | - | `copy` | `bool` | whether to copy `DataFrame`s | """ self._info("Setting up DataFrames") + for _df in df_dict.values(): + assert isinstance(_df, DataFrame), f"Expected DataFrame, got {type(_df)}" supplied_keys = set(df_dict.keys()) expected_keys = set(self.__class__.SUBSET_GLYPH_KWARGS.keys()) @@ -345,20 +368,21 @@ def _setup_dfs(self, df_dict, copy=False): expected_not_supplied = expected_keys.difference(supplied_keys) for _key in supplied_not_expected: - self._warn( - f"{self.__class__.__name__}.__init__(): got unexpected df key {_key}" - ) + self._warn(f"expected df keys {list(expected_keys)}, not {_key}") for _key in expected_not_supplied: - self._warn( - f"{self.__class__.__name__}.__init__(): missing expected df key {_key}" - ) + self._warn(f"expected df keys {list(expected_keys)}, missing {_key}") # assign df with column checks - self.dfs = dict() - mandatory_col_to_default = self._mandatory_column_defaults() + if not hasattr(self, "dfs"): + self.dfs = TypedValueDict(DataFrame) + else: + self.dfs.clear() + + mandatory_col_info = self._mandatory_column_info() for _key in expected_and_supplied: _df = df_dict[_key] - for _col, _default in mandatory_col_to_default.items(): + for _col, _dict in mandatory_col_info.items(): + _default = _dict["default"] # column exists: all good if _col in _df.columns: continue @@ -369,12 +393,14 @@ def _setup_dfs(self, df_dict, copy=False): assert _df.shape[0] == 0, _msg # default value available, will use it to create column else: - _df[_col] = _default - self.dfs[_key] = _df.copy() if copy else _df + _df.set_column_by_constant(_col, _default) + self.dfs[_key] = _df # expected dfs must be present for _key in expected_not_supplied: - _df = pd.DataFrame(columns=list(mandatory_col_to_default.keys())) + _df = DataFrame.empty_with_columns( + {col: _d["type"] for col, _d in mandatory_col_info.items()} + ) self.dfs[_key] = _df def _setup_sources(self): @@ -383,7 +409,10 @@ def _setup_sources(self): Intended to be extended in child classes for pre/post processing. """ self._info("Setting up sources") - self.sources = {_key: ColumnDataSource(_df) for _key, _df in self.dfs.items()} + self.sources = { + _key: ColumnDataSource(_df.to_dict_of_lists()) + for _key, _df in self.dfs.items() + } self._postprocess_sources() # initialize attributes that couple with sources @@ -536,7 +565,7 @@ def _update_sources(self): such as dynamic plotting kwargs, need to be re-assigned. """ for _key in self.dfs.keys(): - self.sources[_key].data = self.dfs[_key] + self.sources[_key].data = self.dfs[_key].to_dict_of_lists() self._postprocess_sources() # reset selections now that source indices may have changed @@ -845,6 +874,9 @@ def find_embedding_fields(self): else: # embedding columns must be the same across subsets assert embedding_cols == _emb_cols, "Inconsistent embedding columns" + assert ( + embedding_cols is not None + ), f"No embedding columns found: {[_df.columns for _df in self.dfs.values()]}" assert ( len(embedding_cols) >= 2 ), f"Expected at least two embedding columns, found {embedding_cols}" @@ -858,7 +890,7 @@ def auto_color_mapping(self): labels = set() for _key in self.dfs.keys(): - labels = labels.union(set(self.dfs[_key]["label"].values)) + labels = labels.union(set(DataFrame.series_values(self.dfs[_key]["label"]))) return auto_label_color(labels) diff --git a/hover/core/explorer/feature.py b/hover/core/explorer/feature.py index c61ce31e..ef6fca4c 100644 --- a/hover/core/explorer/feature.py +++ b/hover/core/explorer/feature.py @@ -2,11 +2,11 @@ ???+ note "Intermediate classes based on the main feature." """ import re -import hover import numpy as np from functools import lru_cache from bokeh.models import TextInput, Slider from .base import BokehBaseExplorer +from .local_config import TOOLTIP_IMG_STYLE class BokehForText(BokehBaseExplorer): @@ -23,7 +23,10 @@ class BokehForText(BokehBaseExplorer): """ PRIMARY_FEATURE = "text" - MANDATORY_COLUMNS = [PRIMARY_FEATURE, "label"] + MANDATORY_COLUMN_TO_TYPE_DEFAULT = { + PRIMARY_FEATURE: (str, None), + "label": (str, None), + } TOOLTIP_KWARGS = { "label": {"label": "Label"}, "text": {"text": "Text"}, @@ -174,7 +177,10 @@ class BokehForAudio(BokehForUrlToVector): """ PRIMARY_FEATURE = "audio" - MANDATORY_COLUMNS = [PRIMARY_FEATURE, "label"] + MANDATORY_COLUMN_TO_TYPE_DEFAULT = { + PRIMARY_FEATURE: (str, None), + "label": (str, None), + } TOOLTIP_KWARGS = { "label": {"label": "Label"}, "audio": {"audio": ""}, @@ -197,10 +203,13 @@ class BokehForImage(BokehForUrlToVector): """ PRIMARY_FEATURE = "image" - MANDATORY_COLUMNS = [PRIMARY_FEATURE, "label"] + MANDATORY_COLUMN_TO_TYPE_DEFAULT = { + PRIMARY_FEATURE: (str, None), + "label": (str, None), + } TOOLTIP_KWARGS = { "label": {"label": "Label"}, - "image": {"image": hover.config["visual"]["tooltip_img_style"]}, + "image": {"image": TOOLTIP_IMG_STYLE}, "coords": True, "index": True, } diff --git a/hover/core/explorer/functionality.py b/hover/core/explorer/functionality.py index 97131fa4..10ee314f 100644 --- a/hover/core/explorer/functionality.py +++ b/hover/core/explorer/functionality.py @@ -6,7 +6,10 @@ from bokeh.models import CDSView, IndexFilter, Dropdown, Button from bokeh.palettes import Category20 from bokeh.layouts import row -from hover import module_config +from hover.module_config import ( + DataFrame as DF, + ABSTAIN_DECODED, +) from hover.utils.misc import current_time from hover.utils.bokeh_helper import bokeh_hover_tooltip from .local_config import SOURCE_COLOR_FIELD, SOURCE_ALPHA_FIELD, SEARCH_SCORE_FIELD @@ -130,11 +133,7 @@ def _postprocess_sources(self): color_dict = self.auto_color_mapping() for _key, _df in self.dfs.items(): - _color = ( - _df["label"] - .apply(lambda label: color_dict.get(label, "gainsboro")) - .tolist() - ) + _color = _df.column_map("label", color_dict, output="list") self.sources[_key].add(_color, SOURCE_COLOR_FIELD) def _update_colors(self): @@ -146,11 +145,7 @@ def _update_colors(self): # infer glyph colors dynamically color_dict = self.auto_color_mapping() - color_list = ( - self.dfs["raw"]["label"] - .apply(lambda label: color_dict.get(label, "gainsboro")) - .tolist() - ) + color_list = self.dfs["raw"].column_map("label", color_dict, output="list") self.sources["raw"].patch( {SOURCE_COLOR_FIELD: [(slice(len(color_list)), color_list)]} ) @@ -188,7 +183,11 @@ def callback_apply(): self._info(f"applying {len(selected_idx)} annotations...") # update label in both the df and the data source - self.dfs["raw"].loc[selected_idx, "label"] = label + self.dfs["raw"].set_column_by_constant( + "label", + label, + indices=selected_idx, + ) patch_to_apply = [(_idx, label) for _idx in selected_idx] self.sources["raw"].patch({"label": patch_to_apply}) self._good(f"applied {len(selected_idx)} annotations: {label}") @@ -272,20 +271,20 @@ def _build_tooltip(self, specified): ) return specified - def _mandatory_column_defaults(self): + def _mandatory_column_info(self): """ - ???+ note "Mandatory columns and default values." + ???+ note "Mandatory columns, types, and default values." If default value is None, will raise exception if the column is not found. """ - column_to_value = super()._mandatory_column_defaults() - column_to_value.update( + column_info = super()._mandatory_column_info() + column_info.update( { - self.label_col: module_config.ABSTAIN_DECODED, - self.score_col: 0.5, + self.label_col: {"type": str, "default": ABSTAIN_DECODED}, + self.score_col: {"type": float, "default": 0.5}, } ) - return column_to_value + return column_info def _postprocess_sources(self): """ @@ -294,12 +293,9 @@ def _postprocess_sources(self): # infer glyph color from labels color_dict = self.auto_color_mapping() - def get_color(label): - return color_dict.get(label, "gainsboro") - # infer glyph alpha from pseudo-percentile of soft label scores scores = np.concatenate( - [_df[self.score_col].tolist() for _df in self.dfs.values()] + [DF.series_tolist(_df[self.score_col]) for _df in self.dfs.values()] ) scores_mean = scores.mean() scores_std = scores.std() + 1e-4 @@ -314,8 +310,8 @@ def pseudo_percentile(confidence, lower=0.1, upper=0.9): # infer alpha from score percentiles for _key, _df in self.dfs.items(): - _color = _df[self.label_col].apply(get_color).tolist() - _alpha = _df[self.score_col].apply(pseudo_percentile).tolist() + _color = _df.column_map(self.label_col, color_dict, output="list") + _alpha = _df.column_apply(self.score_col, pseudo_percentile, output="list") self.sources[_key].add(_color, SOURCE_COLOR_FIELD) self.sources[_key].add(_alpha, SOURCE_ALPHA_FIELD) @@ -346,6 +342,7 @@ def subroutine(df, lower, upper): """ Calculate indices with score between lower/upper bounds. """ + # note: comparing series with scalar is the same in pandas/polars keep_l = set(np.where(df[self.score_col] >= lower)[0]) keep_u = set(np.where(df[self.score_col] <= upper)[0]) kept = keep_l.intersection(keep_u) @@ -444,17 +441,17 @@ def __init__(self, df_dict, label_col_a, label_col_b, **kwargs): self.label_col_b = label_col_b super().__init__(df_dict, **kwargs) - def _mandatory_column_defaults(self): + def _mandatory_column_info(self): """ ???+ note "Mandatory columns and default values." If default value is None, will raise exception if the column is not found. """ - column_to_value = super()._mandatory_column_defaults() + column_to_value = super()._mandatory_column_info() column_to_value.update( { - self.label_col_a: None, - self.label_col_b: None, + self.label_col_a: {"type": str, "default": None}, + self.label_col_b: {"type": str, "default": None}, } ) return column_to_value @@ -476,10 +473,13 @@ def plot(self, label, **kwargs): eff_kwargs["legend_label"] = f"{label}" # create agreement/increment/decrement subsets - col_a_pos = np.where(self.dfs[_key][self.label_col_a] == label)[0].tolist() - col_a_neg = np.where(self.dfs[_key][self.label_col_a] != label)[0].tolist() - col_b_pos = np.where(self.dfs[_key][self.label_col_b] == label)[0].tolist() - col_b_neg = np.where(self.dfs[_key][self.label_col_b] != label)[0].tolist() + # note: comparing series with constant is the same in pandas/polars + mask_a = DF.series_values(self.dfs[_key][self.label_col_a] == label) + mask_b = DF.series_values(self.dfs[_key][self.label_col_b] == label) + col_a_pos = np.where(mask_a)[0].tolist() + col_a_neg = np.where(np.logical_not(mask_a))[0].tolist() + col_b_pos = np.where(mask_b)[0].tolist() + col_b_neg = np.where(np.logical_not(mask_b))[0].tolist() agreement_view = CDSView( source=_source, filters=[IndexFilter(col_a_pos), IndexFilter(col_b_pos)] ) @@ -653,13 +653,11 @@ def callback_apply(event): ) return - labels = self.dfs["raw"].iloc[selected_idx].apply(lf, axis=1).values - num_nontrivial = len( - list(filter(lambda l: l != module_config.ABSTAIN_DECODED, labels)) - ) + labels = self.dfs["raw"].row_apply(lf, indices=selected_idx, output="numpy") + num_nontrivial = len(list(filter(lambda l: l != ABSTAIN_DECODED, labels))) # update label in both the df and the data source - self.dfs["raw"].loc[selected_idx, "label"] = labels + self.dfs["raw"].set_column_by_array("label", labels, indices=selected_idx) for _idx, _label in zip(selected_idx, labels): _idx = int(_idx) self.sources["raw"].patch({"label": [(_idx, _label)]}) @@ -692,11 +690,13 @@ def callback_filter(event): for _key, _source in self.sources.items(): _selected = _source.selected.indices - _labels = self.dfs[_key].iloc[_selected].apply(lf, axis=1).values + _labels = self.dfs[_key].row_apply( + lf, indices=_selected, output="numpy" + ) _kept = [ _idx for _idx, _label in zip(_selected, _labels) - if _label != module_config.ABSTAIN_DECODED + if _label != ABSTAIN_DECODED ] self.sources[_key].selected.indices = _kept @@ -798,8 +798,8 @@ def refresh_glyphs(self, lf_name): assert lf_name in self.lf_data, f"trying to refresh non-existing LF: {lf_name}" lf = self.lf_data[lf_name]["lf"] - L_raw = self.dfs["raw"].apply(lf, axis=1).values - L_labeled = self.dfs["labeled"].apply(lf, axis=1).values + L_raw = self.dfs["raw"].row_apply(lf, output="numpy") + L_labeled = self.dfs["labeled"].row_apply(lf, output="numpy") glyph_codes = self.lf_data[lf_name]["glyphs"].keys() if "C" in glyph_codes: @@ -841,9 +841,9 @@ def plot_new_lf( # calculate predicted labels if not provided if L_raw is None: - L_raw = self.dfs["raw"].apply(lf, axis=1).values + L_raw = self.dfs["raw"].row_apply(lf, output="numpy") if L_labeled is None: - L_labeled = self.dfs["labeled"].apply(lf, axis=1).values + L_labeled = self.dfs["labeled"].row_apply(lf, output="numpy") # prepare plot settings assert self.palette, f"Palette depleted, # LFs: {len(self.lf_data)}" @@ -923,8 +923,8 @@ def _view_correct(self, L_labeled): if L_labeled.shape[0] == 0: indices = [] else: - agreed = self.dfs["labeled"]["label"].values == L_labeled - attempted = L_labeled != module_config.ABSTAIN_DECODED + agreed = DF.series_values(self.dfs["labeled"]["label"]) == L_labeled + attempted = L_labeled != ABSTAIN_DECODED indices = np.where(np.multiply(agreed, attempted))[0].tolist() view = CDSView(source=self.sources["labeled"], filters=[IndexFilter(indices)]) return view @@ -939,8 +939,8 @@ def _view_incorrect(self, L_labeled): if L_labeled.shape[0] == 0: indices = [] else: - disagreed = self.dfs["labeled"]["label"].values != L_labeled - attempted = L_labeled != module_config.ABSTAIN_DECODED + disagreed = DF.series_values(self.dfs["labeled"]["label"]) != L_labeled + attempted = L_labeled != ABSTAIN_DECODED indices = np.where(np.multiply(disagreed, attempted))[0].tolist() view = CDSView(source=self.sources["labeled"], filters=[IndexFilter(indices)]) return view @@ -956,8 +956,10 @@ def _view_missed(self, L_labeled, targets): if L_labeled.shape[0] == 0: indices = [] else: - targetable = np.isin(self.dfs["labeled"]["label"], targets) - abstained = L_labeled == module_config.ABSTAIN_DECODED + targetable = np.isin( + DF.series_values(self.dfs["labeled"]["label"]), targets + ) + abstained = L_labeled == ABSTAIN_DECODED indices = np.where(np.multiply(targetable, abstained))[0].tolist() view = CDSView(source=self.sources["labeled"], filters=[IndexFilter(indices)]) return view @@ -972,6 +974,6 @@ def _view_hit(self, L_raw): if L_raw.shape[0] == 0: indices = [] else: - indices = np.where(L_raw != module_config.ABSTAIN_DECODED)[0].tolist() + indices = np.where(L_raw != ABSTAIN_DECODED)[0].tolist() view = CDSView(source=self.sources["raw"], filters=[IndexFilter(indices)]) return view diff --git a/hover/core/explorer/local_config.py b/hover/core/explorer/local_config.py index 16e7c7d1..50b639df 100644 --- a/hover/core/explorer/local_config.py +++ b/hover/core/explorer/local_config.py @@ -1,5 +1,11 @@ import hover +from hover.config_constants import ( + ConfigSection as Section, + ConfigKey as Key, +) -SOURCE_COLOR_FIELD = hover.config["data.columns"]["source_color_field"] -SOURCE_ALPHA_FIELD = hover.config["data.columns"]["source_alpha_field"] -SEARCH_SCORE_FIELD = hover.config["data.columns"]["search_score_field"] +SOURCE_COLOR_FIELD = hover.config[Section.DATA_COLUMNS][Key.SOURCE_COLOR_FIELD] +SOURCE_ALPHA_FIELD = hover.config[Section.DATA_COLUMNS][Key.SOURCE_ALPHA_FIELD] +SEARCH_SCORE_FIELD = hover.config[Section.DATA_COLUMNS][Key.SEARCH_SCORE_FIELD] + +TOOLTIP_IMG_STYLE = hover.config[Section.VISUAL][Key.TOOLTIP_IMG_STYLE] diff --git a/hover/core/explorer/specialization.py b/hover/core/explorer/specialization.py index f2c67878..63ec0812 100644 --- a/hover/core/explorer/specialization.py +++ b/hover/core/explorer/specialization.py @@ -19,7 +19,7 @@ class BokehTextFinder(BokehDataFinder, BokehForText): """ TOOLTIP_KWARGS = BokehForText.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForText.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForText.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehDataFinder.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -42,7 +42,7 @@ class BokehTextAnnotator(BokehDataAnnotator, BokehForText): """ TOOLTIP_KWARGS = BokehForText.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForText.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForText.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehDataAnnotator.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -63,7 +63,7 @@ class BokehTextSoftLabel(BokehSoftLabelExplorer, BokehForText): """ TOOLTIP_KWARGS = BokehForText.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForText.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForText.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehSoftLabelExplorer.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -84,7 +84,7 @@ class BokehTextMargin(BokehMarginExplorer, BokehForText): """ TOOLTIP_KWARGS = BokehForText.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForText.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForText.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehMarginExplorer.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -104,7 +104,7 @@ class BokehTextSnorkel(BokehSnorkelExplorer, BokehForText): """ TOOLTIP_KWARGS = BokehForText.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForText.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForText.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehSnorkelExplorer.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -125,7 +125,7 @@ class BokehAudioFinder(BokehDataFinder, BokehForAudio): """ TOOLTIP_KWARGS = BokehForAudio.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForAudio.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForAudio.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehDataFinder.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -148,7 +148,7 @@ class BokehAudioAnnotator(BokehDataAnnotator, BokehForAudio): """ TOOLTIP_KWARGS = BokehForAudio.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForAudio.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForAudio.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehDataAnnotator.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -169,7 +169,7 @@ class BokehAudioSoftLabel(BokehSoftLabelExplorer, BokehForAudio): """ TOOLTIP_KWARGS = BokehForAudio.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForAudio.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForAudio.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehSoftLabelExplorer.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -190,7 +190,7 @@ class BokehAudioMargin(BokehMarginExplorer, BokehForAudio): """ TOOLTIP_KWARGS = BokehForAudio.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForAudio.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForAudio.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehMarginExplorer.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -210,7 +210,7 @@ class BokehAudioSnorkel(BokehSnorkelExplorer, BokehForAudio): """ TOOLTIP_KWARGS = BokehForAudio.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForAudio.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForAudio.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehSnorkelExplorer.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -231,7 +231,7 @@ class BokehImageFinder(BokehDataFinder, BokehForImage): """ TOOLTIP_KWARGS = BokehForImage.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForImage.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForImage.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehDataFinder.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -254,7 +254,7 @@ class BokehImageAnnotator(BokehDataAnnotator, BokehForImage): """ TOOLTIP_KWARGS = BokehForImage.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForImage.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForImage.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehDataAnnotator.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -275,7 +275,7 @@ class BokehImageSoftLabel(BokehSoftLabelExplorer, BokehForImage): """ TOOLTIP_KWARGS = BokehForImage.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForImage.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForImage.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehSoftLabelExplorer.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -296,7 +296,7 @@ class BokehImageMargin(BokehMarginExplorer, BokehForImage): """ TOOLTIP_KWARGS = BokehForImage.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForImage.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForImage.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehMarginExplorer.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -316,7 +316,7 @@ class BokehImageSnorkel(BokehSnorkelExplorer, BokehForImage): """ TOOLTIP_KWARGS = BokehForImage.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForImage.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForImage.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehSnorkelExplorer.SUBSET_GLYPH_KWARGS def _layout_widgets(self): diff --git a/hover/core/local_config.py b/hover/core/local_config.py index 49a56ddb..0787ad61 100644 --- a/hover/core/local_config.py +++ b/hover/core/local_config.py @@ -1,5 +1,9 @@ import re import hover +from hover.config_constants import ( + ConfigSection as Section, + ConfigKey as Key, +) from bokeh.models import ( Div, TableColumn, @@ -8,7 +12,10 @@ ) -DATASET_SUBSET_FIELD = hover.config["data.columns"]["dataset_subset_field"] +DEFAULT_REDUCTION_METHOD = hover.config[Section.DATA_EMBEDDING][ + Key.DEFAULT_REDUCTION_METHOD +] +DATASET_SUBSET_FIELD = hover.config[Section.DATA_COLUMNS][Key.DATASET_SUBSET_FIELD] COLOR_GLYPH_TEMPLATE = """
@@ -16,7 +23,7 @@
""" -EMBEDDING_FIELD_PREFIX = hover.config["data.columns"]["embedding_field_prefix"] +EMBEDDING_FIELD_PREFIX = hover.config[Section.DATA_COLUMNS][Key.EMBEDDING_FIELD_PREFIX] EMBEDDING_FIELD_REGEX = r"\d+d_\d+$" @@ -53,7 +60,7 @@ def dataset_default_sel_table_columns(feature_key): template="""<%= value %>""" ) elif feature_key == "image": - style = hover.config["visual"]["table_img_style"] + style = hover.config[Section.VISUAL][Key.TABLE_IMG_STYLE] # width is easily adjustable on the UI, no need to make configurable here feature_col_kwargs["width"] = 200 feature_col_kwargs["formatter"] = HTMLTemplateFormatter( diff --git a/hover/core/neural.py b/hover/core/neural.py index 0eafbe83..44e5bbfa 100644 --- a/hover/core/neural.py +++ b/hover/core/neural.py @@ -4,7 +4,6 @@ `torch`-based template classes for implementing neural nets that work the most smoothly with `hover`. """ import os -import hover import numpy as np import torch import torch.nn.functional as F @@ -15,6 +14,7 @@ from hover.core import Loggable from hover.utils.metrics import classification_accuracy from hover.utils.misc import current_time +from .local_config import DEFAULT_REDUCTION_METHOD class BaseVectorNet(Loggable): @@ -325,7 +325,11 @@ def predict_proba(self, inps): return probs def manifold_trajectory( - self, inps, method=None, reducer_kwargs=None, spline_kwargs=None + self, + inps, + method=DEFAULT_REDUCTION_METHOD, + reducer_kwargs=None, + spline_kwargs=None, ): """ ???+ note "Compute a propagation trajectory of the dataset manifold through the neural net." @@ -346,9 +350,6 @@ def manifold_trajectory( from hover.core.representation.manifold import LayerwiseManifold from hover.core.representation.trajectory import manifold_spline - if method is None: - method = hover.config["data.embedding"]["default_reduction_method"] - reducer_kwargs = reducer_kwargs or {} spline_kwargs = spline_kwargs or {} diff --git a/hover/core/representation/local_config.py b/hover/core/representation/local_config.py index 15ba1b5f..93be1e0b 100644 --- a/hover/core/representation/local_config.py +++ b/hover/core/representation/local_config.py @@ -1,6 +1,16 @@ +import hover +from hover.config_constants import ( + ConfigSection as Section, + ConfigKey as Key, +) + KWARG_TRANSLATOR = { "dimension": { "umap": "n_components", "ivis": "embedding_dims", }, } + +DEFAULT_REDUCTION_METHOD = hover.config[Section.DATA_EMBEDDING][ + Key.DEFAULT_REDUCTION_METHOD +] diff --git a/hover/core/representation/manifold.py b/hover/core/representation/manifold.py index a08f1bdf..cea3462c 100644 --- a/hover/core/representation/manifold.py +++ b/hover/core/representation/manifold.py @@ -2,11 +2,11 @@ Manifold similarity measures for any collection of sequences of vectors. Can be useful for improved interpretability of neural nets. """ -import hover from tqdm import tqdm from scipy.spatial import procrustes from hover.core import Loggable from .reduction import DimensionalityReducer +from .local_config import DEFAULT_REDUCTION_METHOD class LayerwiseManifold(Loggable): @@ -64,7 +64,7 @@ def unfold(self, method=None, **kwargs): :type method: str """ if method is None: - method = hover.config["data.embedding"]["default_reduction_method"] + method = DEFAULT_REDUCTION_METHOD # default kwargs should fix random state and seed # so that randomness does not introduce disparity diff --git a/hover/core/representation/reduction.py b/hover/core/representation/reduction.py index 7d1655fd..11909312 100644 --- a/hover/core/representation/reduction.py +++ b/hover/core/representation/reduction.py @@ -5,10 +5,9 @@ Icing on the cake: unify the syntax across different kinds of reducers. """ -import hover import numpy as np from hover.core import Loggable -from .local_config import KWARG_TRANSLATOR +from .local_config import KWARG_TRANSLATOR, DEFAULT_REDUCTION_METHOD class DimensionalityReducer(Loggable): @@ -22,7 +21,7 @@ def __init__(self, array): self.reference_array = array @staticmethod - def create_reducer(method=None, *args, **kwargs): + def create_reducer(method=DEFAULT_REDUCTION_METHOD, *args, **kwargs): """ ???+ note "Handle kwarg translation and dynamic imports." @@ -32,9 +31,6 @@ def create_reducer(method=None, *args, **kwargs): | `*args` | | forwarded to the reducer | | `**kwargs` | | translated and forwarded | """ - if method is None: - method = hover.config["data.embedding"]["default_reduction_method"] - if method == "umap": import umap @@ -57,7 +53,7 @@ def create_reducer(method=None, *args, **kwargs): reducer = reducer_cls(*args, **translated_kwargs) return reducer - def fit_transform(self, method=None, *args, **kwargs): + def fit_transform(self, method=DEFAULT_REDUCTION_METHOD, *args, **kwargs): """ ???+ note "Fit and transform an array and store the reducer." | Param | Type | Description | @@ -66,15 +62,12 @@ def fit_transform(self, method=None, *args, **kwargs): | `*args` | | forwarded to the reducer | | `**kwargs` | | forwarded to the reducer | """ - if method is None: - method = hover.config["data.embedding"]["default_reduction_method"] - reducer = DimensionalityReducer.create_reducer(method=method, *args, **kwargs) embedding = reducer.fit_transform(self.reference_array) setattr(self, method, reducer) return embedding - def transform(self, array, method=None): + def transform(self, array, method=DEFAULT_REDUCTION_METHOD): """ ???+ note "Transform an array with a already-fitted reducer." | Param | Type | Description | @@ -82,9 +75,6 @@ def transform(self, array, method=None): | `array` | `np.ndarray` | the array to transform | | `method` | `str` | `"umap"` or `"ivis"` | """ - if method is None: - method = hover.config["data.embedding"]["default_reduction_method"] - assert isinstance(array, np.ndarray), f"Expected np.ndarray, got {type(array)}" # edge case: array is too small if array.shape[0] < 1: diff --git a/hover/module_config.py b/hover/module_config.py index dff6c60b..6cc32e17 100644 --- a/hover/module_config.py +++ b/hover/module_config.py @@ -1,12 +1,27 @@ import hover +from .config_constants import ( + ConfigSection as Section, + ConfigKey as Key, +) +from .utils.dataframe import ( + PandasDataframe, + PolarsDataframe, +) + +# dataframe implementation +DataFrame = ( + PandasDataframe + if hover.config[Section.BACKEND][Key.DATAFRAME_LIBRARY].lower() == "pandas" + else PolarsDataframe +) # constants for the abstain mechanism -ABSTAIN_DECODED = hover.config["data.values"]["abstain_decoded"] -ABSTAIN_ENCODED = hover.config["data.values"]["abstain_encoded"] -ABSTAIN_HEXCOLOR = hover.config["visual"]["abstain_hexcolor"] +ABSTAIN_DECODED = hover.config[Section.DATA_VALUES][Key.ABSTAIN_DECODED] +ABSTAIN_ENCODED = hover.config[Section.DATA_VALUES][Key.ABSTAIN_ENCODED] +ABSTAIN_HEXCOLOR = hover.config[Section.VISUAL][Key.ABSTAIN_HEXCOLOR] # constants for label encoding mechanism -ENCODED_LABEL_KEY = hover.config["data.columns"]["encoded_label_key"] +ENCODED_LABEL_KEY = hover.config[Section.DATA_COLUMNS][Key.ENCODED_LABEL_KEY] # constants for saving work -DATA_SAVE_DIR = hover.config["io"]["data_save_dir"] +DATA_SAVE_DIR = hover.config[Section.IO][Key.DATA_SAVE_DIR] diff --git a/hover/recipes/local_config.py b/hover/recipes/local_config.py new file mode 100644 index 00000000..7e3b5eb7 --- /dev/null +++ b/hover/recipes/local_config.py @@ -0,0 +1,9 @@ +import hover +from hover.config_constants import ( + ConfigSection as Section, + ConfigKey as Key, +) + +DEFAULT_REDUCTION_METHOD = hover.config[Section.DATA_EMBEDDING][ + Key.DEFAULT_REDUCTION_METHOD +] diff --git a/hover/recipes/subroutine.py b/hover/recipes/subroutine.py index 1f88b555..30213503 100644 --- a/hover/recipes/subroutine.py +++ b/hover/recipes/subroutine.py @@ -7,11 +7,12 @@ """ import re import numpy as np -import hover import hover.core.explorer as hovex +from hover.module_config import DataFrame as DF from bokeh.layouts import row, column from bokeh.models import Button from rich.console import Console +from .local_config import DEFAULT_REDUCTION_METHOD EXPLORER_CATALOG = { @@ -112,7 +113,7 @@ def standard_annotator(dataset, **kwargs): annotator.activate_search() annotator.plot() - # subscribe for df updates + # subscribe for dataset updates dataset.subscribe_update_push(annotator, {_k: _k for _k in subsets}) # annotators can commit to a dataset @@ -152,7 +153,7 @@ def standard_finder(dataset, **kwargs): finder.activate_search() finder.plot() - # subscribe for df updates + # subscribe for dataset updates dataset.subscribe_update_push(finder, {_k: _k for _k in subsets}) return finder @@ -269,7 +270,7 @@ def retrain_vecnet(): vecnet.auto_adjust_setup(dataset.classes) train_loader = vecnet.prepare_loader(dataset, "train", smoothing_coeff=0.2) - if dataset.dfs["dev"].shape[0] > 0: + if dataset.subset("dev").shape[0] > 0: dev_loader = vecnet.prepare_loader(dataset, "dev") else: dataset._warn("dev set is empty, borrowing train set for validation.") @@ -287,33 +288,33 @@ def update_softlabel_plot(): use_subsets = ("raw", "train", "dev") inps = [] for _key in use_subsets: - inps.extend(dataset.dfs[_key][feature_key].tolist()) + inps.extend(DF.series_tolist(dataset.subset(_key)[feature_key])) probs = vecnet.predict_proba(inps) labels = [dataset.label_decoder[_val] for _val in probs.argmax(axis=-1)] scores = probs.max(axis=-1).tolist() traj_arr, _, _ = vecnet.manifold_trajectory( inps, - method=hover.config["data.embedding"]["default_reduction_method"], + method=DEFAULT_REDUCTION_METHOD, reducer_kwargs=dict(dimension=manifold_dim), spline_kwargs=dict(points_per_step=5), ) offset = 0 for _key in use_subsets: - _length = dataset.dfs[_key].shape[0] + _length = dataset.subset(_key).shape[0] # skip subset if empty if _length == 0: continue _slice = slice(offset, offset + _length) - dataset.dfs[_key]["pred_label"] = labels[_slice] - dataset.dfs[_key]["pred_score"] = scores[_slice] + dataset.subset(_key).set_column_by_array("pred_label", labels[_slice]) + dataset.subset(_key).set_column_by_array("pred_score", scores[_slice]) for i, _col in enumerate(manifold_traj_cols): # all steps, selected slice _traj = traj_arr[:, _slice, i] # selected slice, all steps _traj = list(np.swapaxes(_traj, 0, 1)) - dataset.dfs[_key][f"{_col}_traj"] = _traj + dataset.subset(_key).set_column_by_array(f"{_col}_traj", _traj) offset += _length diff --git a/hover/utils/bokeh_helper/__init__.py b/hover/utils/bokeh_helper/__init__.py index b2c2b497..c688888f 100644 --- a/hover/utils/bokeh_helper/__init__.py +++ b/hover/utils/bokeh_helper/__init__.py @@ -2,14 +2,13 @@ ???+ note "Useful subroutines for working with bokeh in general." """ import os -import hover import numpy as np from functools import wraps from traceback import format_exc from urllib.parse import urljoin, urlparse from bokeh.models import PreText from bokeh.layouts import column -from hover import module_config +from hover.module_config import ABSTAIN_DECODED, ABSTAIN_HEXCOLOR from .local_config import ( TOOLTIP_TEXT_TEMPLATE, TOOLTIP_IMAGE_TEMPLATE, @@ -18,6 +17,8 @@ TOOLTIP_LABEL_TEMPLATE, TOOLTIP_COORDS_DIV, TOOLTIP_INDEX_DIV, + BOKEH_PALETTE_USAGE, + BOKEH_PALETTE, ) @@ -26,11 +27,10 @@ def auto_label_color(labels): ???+ note "Create a label->hex color mapping dict." """ use_labels = set(labels) - use_labels.discard(module_config.ABSTAIN_DECODED) + use_labels.discard(ABSTAIN_DECODED) use_labels = sorted(use_labels, reverse=False) - palette = hover.config["visual"]["bokeh_palette"] - usage = hover.config["visual"]["bokeh_palette_usage"] + palette, usage = BOKEH_PALETTE, BOKEH_PALETTE_USAGE nlabels, ncolors = len(use_labels), len(palette) assert nlabels <= ncolors, f"Too many labels to support (max at {len(palette)})" @@ -50,7 +50,7 @@ def auto_label_color(labels): use_palette = [palette[i] for i in use_palette_idx] color_dict = { - module_config.ABSTAIN_DECODED: module_config.ABSTAIN_HEXCOLOR, + ABSTAIN_DECODED: ABSTAIN_HEXCOLOR, **{_l: _c for _l, _c in zip(use_labels, use_palette)}, } return color_dict diff --git a/hover/utils/bokeh_helper/local_config.py b/hover/utils/bokeh_helper/local_config.py index 78fb5991..3d622034 100644 --- a/hover/utils/bokeh_helper/local_config.py +++ b/hover/utils/bokeh_helper/local_config.py @@ -1,3 +1,13 @@ +import hover +from hover.config_constants import ( + ConfigSection as Section, + ConfigKey as Key, +) + + +BOKEH_PALETTE = hover.config[Section.VISUAL][Key.BOKEH_PALETTE] +BOKEH_PALETTE_USAGE = hover.config[Section.VISUAL][Key.BOKEH_PALETTE_USAGE] + TOOLTIP_TEXT_TEMPLATE = """