From ff7a16e8e4729223533e63cfd9df71b89fe19050 Mon Sep 17 00:00:00 2001
From: Pavel <74003834+phurwicz@users.noreply.github.com>
Date: Fri, 28 Apr 2023 16:37:58 -0400
Subject: [PATCH] Support polars as DataFrame engine (#67) and adjust all tests and doc scripts

---
 .github/workflows/cross-os-conda-build.yml    |   2 +-
 .github/workflows/cross-os-install-source.yml |   2 +-
 .github/workflows/cross-os-source-test.yml    |   9 +-
 .github/workflows/doc-auto-notebook.yml       |   2 +-
 .github/workflows/doc-script-test.yml         |   2 +-
 .github/workflows/quick-source-test.yml       |   9 +-
 docs/snippets/py/g0-4a-reduction-print.txt    |   2 +-
 docs/snippets/py/t0-0a-dataset-text-print.txt |   2 +-
 docs/snippets/py/t0-1a-vectorizer-print.txt   |   2 +-
 docs/snippets/py/t0-2a-reduction-print.txt    |   2 +-
 .../py/t3-2-dataset-selection-table.txt       |   2 +-
 hover/__init__.py                             |  93 +--
 hover/config_constants.py                     |  78 +++
 hover/core/dataset.py                         | 211 ++---
 hover/core/explorer/base.py                   |  80 ++-
 hover/core/explorer/feature.py                |  19 +-
 hover/core/explorer/functionality.py          | 104 +--
 hover/core/explorer/local_config.py           |  12 +-
 hover/core/explorer/specialization.py         |  30 +-
 hover/core/local_config.py                    |  13 +-
 hover/core/neural.py                          |  11 +-
 hover/core/representation/local_config.py     |  10 +
 hover/core/representation/manifold.py         |   4 +-
 hover/core/representation/reduction.py        |  18 +-
 hover/module_config.py                        |  25 +-
 hover/recipes/local_config.py                 |   9 +
 hover/recipes/subroutine.py                   |  21 +-
 hover/utils/bokeh_helper/__init__.py          |  12 +-
 hover/utils/bokeh_helper/local_config.py      |  10 +
 hover/utils/dataframe.py                      | 590 ++++++++++++++++++
 hover/utils/typecheck.py                      |  21 +
 setup.py                                      |   4 +-
 tests/conftest.py                             |  78 ++-
 tests/core/explorer/local_helper.py           |   6 +-
 tests/core/explorer/test_functionality.py     |  23 +-
 tests/core/representation/test_reduction.py   |   2 +-
 tests/core/test_dataset.py                    |  16 +-
 tests/core/test_neural.py                     |   3 +-
 tests/module_config/hover_alt_config_1.ini    |   9 +-
 tests/recipes/local_helper.py                 |   9 +-
 tests/recipes/test_experimental.py            |  10 +-
 tests/recipes/test_stable.py                  |  19 +-
 tests/utils/test_dataframe.py                 | 502 +++++++++++++++
 tests/utils/test_snorkel_helper.py            |   2 +-
 tests/utils/test_typecheck.py                 |  28 +
 45 files changed, 1778 insertions(+), 340 deletions(-)
 create mode 100644 hover/config_constants.py
 create mode 100644 hover/recipes/local_config.py
 create mode 100644 hover/utils/dataframe.py
 create mode 100644 hover/utils/typecheck.py
 create mode 100644 tests/utils/test_dataframe.py
 create mode 100644 tests/utils/test_typecheck.py

diff --git a/.github/workflows/cross-os-conda-build.yml b/.github/workflows/cross-os-conda-build.yml
index a0d7bd21..ffb35a5f 100644
--- a/.github/workflows/cross-os-conda-build.yml
+++ b/.github/workflows/cross-os-conda-build.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.8', '3.9', '3.10']
+        python-version: ['3.8', '3.10']
         os: [ubuntu-latest, macos-latest, windows-latest]
 
     steps:
diff --git a/.github/workflows/cross-os-install-source.yml b/.github/workflows/cross-os-install-source.yml
index f7e6c468..9df6f1ac 100644
--- a/.github/workflows/cross-os-install-source.yml
+++ b/.github/workflows/cross-os-install-source.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.8', '3.9', '3.10']
+        python-version: ['3.8', '3.10']
         os: [ubuntu-latest, macos-latest, windows-latest]
 
     steps:
diff --git a/.github/workflows/cross-os-source-test.yml b/.github/workflows/cross-os-source-test.yml
index 82346997..7daf9ec6 100644
--- a/.github/workflows/cross-os-source-test.yml
+++
b/.github/workflows/cross-os-source-test.yml @@ -25,8 +25,15 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Test with Tox + - name: Get dependencies run: | pip install --upgrade pip pip install --upgrade tox tox-gh-actions + + - name: Test - default config + run: | tox -e test_api + + - name: Test - alt config 1 + run: | + tox -e test_api -- --hover-ini tests/module_config/hover_alt_config_1.ini diff --git a/.github/workflows/doc-auto-notebook.yml b/.github/workflows/doc-auto-notebook.yml index 9554ae02..4d174494 100644 --- a/.github/workflows/doc-auto-notebook.yml +++ b/.github/workflows/doc-auto-notebook.yml @@ -14,7 +14,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ['3.8'] + python-version: ['3.9'] os: [ubuntu-latest] steps: diff --git a/.github/workflows/doc-script-test.yml b/.github/workflows/doc-script-test.yml index 96361caf..c305e1d5 100644 --- a/.github/workflows/doc-script-test.yml +++ b/.github/workflows/doc-script-test.yml @@ -31,7 +31,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.8', '3.9'] + python-version: ['3.9'] os: [ubuntu-latest] steps: diff --git a/.github/workflows/quick-source-test.yml b/.github/workflows/quick-source-test.yml index 23b2b7b9..777a5e7a 100644 --- a/.github/workflows/quick-source-test.yml +++ b/.github/workflows/quick-source-test.yml @@ -47,12 +47,19 @@ jobs: restore-keys: | ${{ runner.os }}-${{ runner.python-version }}-tox-env- - - name: Test and make coverage report + - name: Get dependencies run: | pip install --upgrade pip pip install --upgrade tox tox-gh-actions + + - name: Test - default config + run: | tox -e test_api + - name: Test - alt config 1 + run: | + tox -e test_api -- --hover-ini tests/module_config/hover_alt_config_1.ini + - name: Codacy Coverage Reporter uses: codacy/codacy-coverage-reporter-action@master if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'push' }} diff --git a/docs/snippets/py/g0-4a-reduction-print.txt b/docs/snippets/py/g0-4a-reduction-print.txt index 3c22c070..bf4fd674 100644 --- a/docs/snippets/py/g0-4a-reduction-print.txt +++ b/docs/snippets/py/g0-4a-reduction-print.txt @@ -1 +1 @@ -dataset.dfs["raw"].head(5) +dataset.dfs["raw"]().head(5) diff --git a/docs/snippets/py/t0-0a-dataset-text-print.txt b/docs/snippets/py/t0-0a-dataset-text-print.txt index 2b561258..3fe85391 100644 --- a/docs/snippets/py/t0-0a-dataset-text-print.txt +++ b/docs/snippets/py/t0-0a-dataset-text-print.txt @@ -1,2 +1,2 @@ # each subset can be accessed as its own DataFrame -dataset.dfs["raw"].head(5) +dataset.dfs["raw"]().head(5) diff --git a/docs/snippets/py/t0-1a-vectorizer-print.txt b/docs/snippets/py/t0-1a-vectorizer-print.txt index 5cf611a9..1a4664e9 100644 --- a/docs/snippets/py/t0-1a-vectorizer-print.txt +++ b/docs/snippets/py/t0-1a-vectorizer-print.txt @@ -1,4 +1,4 @@ -text = dataset.dfs["raw"].loc[0, "text"] +text = dataset.dfs["raw"]().loc[0, "text"] vec = vectorizer(text) print(f"Text: {text}") print(f"Vector shape: {vec.shape}") diff --git a/docs/snippets/py/t0-2a-reduction-print.txt b/docs/snippets/py/t0-2a-reduction-print.txt index 3447b46a..d26644f6 100644 --- a/docs/snippets/py/t0-2a-reduction-print.txt +++ b/docs/snippets/py/t0-2a-reduction-print.txt @@ -1,2 +1,2 @@ # what we did adds 'embed_2d_0' and 'embed_2d_1' columns to the DataFrames in dataset.dfs -dataset.dfs["raw"].head(5) +dataset.dfs["raw"]().head(5) diff --git a/docs/snippets/py/t3-2-dataset-selection-table.txt b/docs/snippets/py/t3-2-dataset-selection-table.txt index 
5f262797..13b7239c 100644 --- a/docs/snippets/py/t3-2-dataset-selection-table.txt +++ b/docs/snippets/py/t3-2-dataset-selection-table.txt @@ -1,3 +1,3 @@ -dataset._callback_update_selection(dataset.dfs["raw"].loc[:10]) +dataset._callback_update_selection(dataset.dfs["raw"][:10]) show(dataset.sel_table, notebook_url=notebook_url) diff --git a/hover/__init__.py b/hover/__init__.py index 0b3fd522..77ed7e0d 100644 --- a/hover/__init__.py +++ b/hover/__init__.py @@ -1,124 +1,145 @@ """ Module root where constants get configured. """ -import re +from .config_constants import ( + ConfigSection, + ConfigKey, + Validator, + Preprocessor, +) from flexmod import AutolockedConfigValue, Config, ConfigIndex from bokeh.palettes import Turbo256 + config = ConfigIndex( [ Config( - "io", + ConfigSection.IO, [ AutolockedConfigValue( - "data_save_dir", + ConfigKey.DATA_SAVE_DIR, "The directory path for saving labeled data.", ".", - validation=lambda x: isinstance(x, str), + validation=Validator.is_str, + ), + ], + ), + Config( + ConfigSection.BACKEND, + [ + AutolockedConfigValue( + ConfigKey.DATAFRAME_LIBRARY, + "The library to use for internal dataframes. Must be 'pandas' or 'polars'.", + "pandas", + validation=Validator.is_supported_dataframe_library, + preprocessor=Preprocessor.lower, ), ], ), Config( - "visual", + ConfigSection.VISUAL, [ AutolockedConfigValue( - "abstain_hexcolor", + ConfigKey.ABSTAIN_HEXCOLOR, "Hex code of RGB color.", "#dcdcdc", - validation=lambda x: bool(re.match(r"^\#[0-9a-fA-F]{6}$", x)), + validation=Validator.is_hex_color, + preprocessor=Preprocessor.lower, ), AutolockedConfigValue( - "bokeh_palette", + ConfigKey.BOKEH_PALETTE, "The bokeh color palette to use for plotting. This should be a list of hex color codes.", Turbo256, - validation=lambda x: hasattr(x, "__iter__"), + validation=Validator.is_iterable_of_hex_color, ), AutolockedConfigValue( - "bokeh_palette_usage", + ConfigKey.BOKEH_PALETTE_USAGE, "Specify how colors from the palette should be chosen when there are fewer categories than colors. This needs to be 'iterate' or 'linspace'", "linspace", - validation=lambda x: x in ["iterate", "linspace"], + validation=Validator.is_supported_traversal_mode, + preprocessor=Preprocessor.lower, ), AutolockedConfigValue( - "table_img_style", + ConfigKey.TABLE_IMG_STYLE, "HTML style of images shown in selection tables.", "max-height: 100%; max-width: 100%; object-fit: contain", - preprocessor=lambda x: re.sub(r"(^[\'\"]|[\'\"]$)", "", x), + preprocessor=Preprocessor.remove_quote_at_ends, ), AutolockedConfigValue( - "tooltip_img_style", + ConfigKey.TOOLTIP_IMG_STYLE, "HTML style of images shown in mouse-over-data-point tooltips.", "float: left; margin: 2px 2px 2px 2px; width: 60px; height: 60px;", - preprocessor=lambda x: re.sub(r"(^[\'\"]|[\'\"]$)", "", x), + preprocessor=Preprocessor.remove_quote_at_ends, ), ], ), Config( - "data.embedding", + ConfigSection.DATA_EMBEDDING, [ AutolockedConfigValue( - "default_reduction_method", + ConfigKey.DEFAULT_REDUCTION_METHOD, "Default method for dimensionality reduction. 
Currently either 'umap' or 'ivis'.", "umap", - validation=lambda x: x in ["umap", "ivis"], + validation=Validator.is_supported_dimensionality_reduction, + preprocessor=Preprocessor.lower, ), ], ), Config( - "data.columns", + ConfigSection.DATA_COLUMNS, [ AutolockedConfigValue( - "encoded_label_key", + ConfigKey.ENCODED_LABEL_KEY, "The column name for the encoded label.", "label_encoded", - validation=lambda x: isinstance(x, str), + validation=Validator.is_str, ), AutolockedConfigValue( - "dataset_subset_field", + ConfigKey.DATASET_SUBSET_FIELD, "The column name for dataset subsets.", "SUBSET", - validation=lambda x: isinstance(x, str), + validation=Validator.is_str, ), AutolockedConfigValue( - "embedding_field_prefix", + ConfigKey.EMBEDDING_FIELD_PREFIX, "The prefix of column names for embedding coordinates.", "embed_", - validation=lambda x: isinstance(x, str), + validation=Validator.is_str, ), AutolockedConfigValue( - "source_color_field", + ConfigKey.SOURCE_COLOR_FIELD, "The column name for plotted data point color.", "__COLOR__", - validation=lambda x: isinstance(x, str), + validation=Validator.is_str, ), AutolockedConfigValue( - "source_alpha_field", + ConfigKey.SOURCE_ALPHA_FIELD, "The column name for plotted data point color alpha (opacity).", "__ALPHA__", - validation=lambda x: isinstance(x, str), + validation=Validator.is_str, ), AutolockedConfigValue( - "search_score_field", + ConfigKey.SEARCH_SCORE_FIELD, "The column name for data points' score from search widgets.", "__SEARCH_SCORE__", - validation=lambda x: isinstance(x, str), + validation=Validator.is_str, ), ], ), Config( - "data.values", + ConfigSection.DATA_VALUES, [ AutolockedConfigValue( - "abstain_decoded", + ConfigKey.ABSTAIN_DECODED, "The placeholder label indicating 'no label yet'.", "ABSTAIN", - validation=lambda x: isinstance(x, str), + validation=Validator.is_str, ), AutolockedConfigValue( - "abstain_encoded", + ConfigKey.ABSTAIN_ENCODED, "The encoded value of 'no label yet' which should almost always be -1, never 0 or positive.", -1, - validation=lambda x: isinstance(x, int) and x < 0, + validation=Validator.is_negative_int, ), ], ), diff --git a/hover/config_constants.py b/hover/config_constants.py new file mode 100644 index 00000000..e7c80ace --- /dev/null +++ b/hover/config_constants.py @@ -0,0 +1,78 @@ +import re + + +class ConfigSection: + IO = "io" + BACKEND = "backend" + VISUAL = "visual" + DATA_EMBEDDING = "data.embedding" + DATA_COLUMNS = "data.columns" + DATA_VALUES = "data.values" + + +class ConfigKey: + DATA_SAVE_DIR = "data_save_dir" + DATAFRAME_LIBRARY = "dataframe_library" + ABSTAIN_HEXCOLOR = "abstain_hexcolor" + BOKEH_PALETTE = "bokeh_palette" + BOKEH_PALETTE_USAGE = "bokeh_palette_usage" + TABLE_IMG_STYLE = "table_img_style" + TOOLTIP_IMG_STYLE = "tooltip_img_style" + DEFAULT_REDUCTION_METHOD = "default_reduction_method" + ENCODED_LABEL_KEY = "encoded_label_key" + DATASET_SUBSET_FIELD = "dataset_subset_field" + EMBEDDING_FIELD_PREFIX = "embedding_field_prefix" + SOURCE_COLOR_FIELD = "source_color_field" + SOURCE_ALPHA_FIELD = "source_alpha_field" + SEARCH_SCORE_FIELD = "search_score_field" + ABSTAIN_DECODED = "abstain_decoded" + ABSTAIN_ENCODED = "abstain_encoded" + + +class Validator: + @staticmethod + def is_hex_color(x): + return bool(re.match(r"^\#[0-9a-fA-F]{6}$", x)) + + @staticmethod + def is_iterable(x): + return hasattr(x, "__iter__") + + @staticmethod + def is_iterable_of_hex_color(x): + if not Validator.is_iterable(x): + return False + for i in x: + if not Validator.is_hex_color(i): + 
return False + return True + + @staticmethod + def is_supported_dataframe_library(x): + return x in ["pandas", "polars"] + + @staticmethod + def is_supported_dimensionality_reduction(x): + return x in ["umap", "ivis"] + + @staticmethod + def is_supported_traversal_mode(x): + return x in ["iterate", "linspace"] + + @staticmethod + def is_str(x): + return isinstance(x, str) + + @staticmethod + def is_negative_int(x): + return isinstance(x, int) and x < 0 + + +class Preprocessor: + @staticmethod + def remove_quote_at_ends(x): + return re.sub(r"(^[\'\"]|[\'\"]$)", "", x) + + @staticmethod + def lower(x): + return x.lower() diff --git a/hover/core/dataset.py b/hover/core/dataset.py index b73c0b8d..79df357d 100644 --- a/hover/core/dataset.py +++ b/hover/core/dataset.py @@ -11,15 +11,20 @@ - loading data for training models """ import os -import hover +import operator import numpy as np -import pandas as pd from tqdm import tqdm from collections import OrderedDict -from hover import module_config +from hover.module_config import ( + DataFrame as DF, + ABSTAIN_DECODED, + ABSTAIN_ENCODED, + DATA_SAVE_DIR, +) from hover.core import Loggable from hover.utils.bokeh_helper import auto_label_color from hover.utils.misc import current_time +from hover.utils.typecheck import TypedValueDict from bokeh.models import ( Button, CheckboxGroup, @@ -35,6 +40,7 @@ dataset_default_sel_table_kwargs, COLOR_GLYPH_TEMPLATE, DATASET_SUBSET_FIELD, + DEFAULT_REDUCTION_METHOD, embedding_field, ) @@ -112,7 +118,7 @@ def setup_dfs( | `label_key` | `str` | the key for the `**str**` label in supervised data | """ - def dictl_transform(dictl, labels=True): + def dictl_transform(dictl, subset, labels=True): """ Burner function to transform the input list of dictionaries into standard format. """ @@ -135,31 +141,56 @@ def burner(d): trans_d = {key_transform.get(_k, _k): _v for _k, _v in d.items()} if not labels: - trans_d["label"] = module_config.ABSTAIN_DECODED + trans_d["label"] = ABSTAIN_DECODED + + trans_d[DATASET_SUBSET_FIELD] = subset return trans_d return [burner(_d) for _d in dictl] # standardize records - dictls = { - "raw": dictl_transform(raw_dictl, labels=False), - "train": dictl_transform(train_dictl), - "dev": dictl_transform(dev_dictl), - "test": dictl_transform(test_dictl), - } + dictls = [ + *dictl_transform(raw_dictl, "raw", labels=False), + *dictl_transform(train_dictl, "train"), + *dictl_transform(dev_dictl, "dev"), + *dictl_transform(test_dictl, "test"), + ] + all_subsets_df = DF.construct(dictls) + + assert all_subsets_df.shape[0] > 0, "Expected non-empty dataset" + assert ( + self.__class__.FEATURE_KEY in all_subsets_df.columns + ), f"Expected feature key {self.__class__.FEATURE_KEY}" + assert "label" in all_subsets_df.columns, "Expected label key 'label'" # initialize dataframes - self.dfs = dict() - for _key, _dictl in dictls.items(): - if _dictl: - _df = pd.DataFrame(_dictl) - assert self.__class__.FEATURE_KEY in _df.columns - assert "label" in _df.columns - else: - _df = pd.DataFrame(columns=[self.__class__.FEATURE_KEY, "label"]) + self.dfs = TypedValueDict(DF) + for _key in ["raw", "train", "dev", "test"]: + self.dfs[_key] = all_subsets_df.filter_rows_by_operator( + DATASET_SUBSET_FIELD, operator.eq, _key + ) - self.dfs[_key] = _df + @property + def dfs(self): + """ + ???+ note "Subset -> DataFrame mapping." 
+ """ + return self._dfs + + @dfs.setter + def dfs(self, dfs): + assert isinstance( + dfs, TypedValueDict + ), f"Expected TypedValueDict, got {type(dfs)}" + assert not hasattr(self, "_dfs"), "Resetting `dfs` is forbidden." + self._dfs = dfs + + def subset(self, key): + """ + ???+ note "Return the DataFrame by reference for the given subset." + """ + return self.dfs[key] def copy(self): """ @@ -179,7 +210,7 @@ def compute_feature_index(self): """ feature_to_subset_idx = {} for _subset, _df in self.dfs.items(): - _values = _df[self.__class__.FEATURE_KEY].values + _values = DF.series_values(_df[self.__class__.FEATURE_KEY]) for i, _val in enumerate(_values): if _val in feature_to_subset_idx: raise ValueError( @@ -196,7 +227,9 @@ def locate_by_feature_value(self, value, auto_recompute=True): """ subset, index = self.feature_to_subset_idx[value] - current_value = self.dfs[subset].at[index, self.__class__.FEATURE_KEY] + current_value = self.dfs[subset].get_cell_by_row_column( + index, self.__class__.FEATURE_KEY + ) if current_value != value: if auto_recompute: self._warn("locate_by_feature_value mismatch. Recomputing index.") @@ -211,13 +244,7 @@ def to_pandas(self): """ ???+ note "Export to a pandas DataFrame." """ - dfs = [] - for _subset in ["raw", "train", "dev", "test"]: - _df = self.dfs[_subset].copy() - _df[DATASET_SUBSET_FIELD] = _subset - dfs.append(_df) - - return pd.concat(dfs, axis=0) + return DF.concat_rows(self.dfs.values()).to_pandas() @classmethod def from_pandas(cls, df, **kwargs): @@ -229,11 +256,11 @@ def from_pandas(cls, df, **kwargs): """ SUBSETS = cls.SCRATCH_SUBSETS + cls.PUBLIC_SUBSETS + cls.PRIVATE_SUBSETS - if DATASET_SUBSET_FIELD not in df.columns: - raise ValueError( - f"Expecting column '{DATASET_SUBSET_FIELD}' in the DataFrame which takes values from {SUBSETS}" - ) + assert ( + DATASET_SUBSET_FIELD in df.columns + ), f"Expecting column '{DATASET_SUBSET_FIELD}' in the DataFrame which takes values from {SUBSETS}" + # use the 'silly' approach for robustness dictls = {} for _subset in ["raw", "train", "dev", "test"]: _sub_df = df[df[DATASET_SUBSET_FIELD] == _subset] @@ -415,22 +442,18 @@ def callback_commit(event): ) return - sel_slice = self.dfs[sub_k].iloc[selected_idx] - valid_slice = sel_slice[ - sel_slice["label"] != module_config.ABSTAIN_DECODED - ] + sel_slice = self.dfs[sub_k].select_rows(selected_idx) + valid_slice = sel_slice.filter_rows_by_operator( + "label", operator.ne, ABSTAIN_DECODED + ) # concat to the end and do some accounting size_before = self.dfs[sub_to].shape[0] - self.dfs[sub_to] = pd.concat( - [self.dfs[sub_to], valid_slice], - axis=0, - sort=False, - ignore_index=True, - ) + self.dfs[sub_to] = DF.concat_rows([self.dfs[sub_to], valid_slice]) size_mid = self.dfs[sub_to].shape[0] - self.dfs[sub_to].drop_duplicates( - subset=[self.__class__.FEATURE_KEY], keep="last", inplace=True + self.dfs[sub_to] = self.dfs[sub_to].unique( + subset=[self.__class__.FEATURE_KEY], + keep="last", ) size_after = self.dfs[sub_to].shape[0] @@ -463,10 +486,10 @@ def callback_view(): sel_slices = [] for subset in subsets: selected_idx = sorted(explorer.sources[subset].selected.indices) - sub_slice = explorer.dfs[subset].iloc[selected_idx] + sub_slice = explorer.dfs[subset].select_rows(selected_idx) sel_slices.append(sub_slice) - selected = pd.concat(sel_slices, axis=0) + selected = DF.concat_rows(sel_slices) self._callback_update_selection(selected) def callback_view_refresh(): @@ -519,15 +542,15 @@ def setup_label_coding(self, verbose=True, debug=False): all_labels = 
set() for _key in [*self.__class__.PUBLIC_SUBSETS, *self.__class__.PRIVATE_SUBSETS]: _df = self.dfs[_key] - _found_labels = set(_df["label"].tolist()) + _found_labels = set(DF.series_tolist(_df["label"])) all_labels = all_labels.union(_found_labels) # exclude ABSTAIN from self.classes, but include it in the encoding - all_labels.discard(module_config.ABSTAIN_DECODED) + all_labels.discard(ABSTAIN_DECODED) self.classes = sorted(all_labels) self.label_encoder = { **{_label: _i for _i, _label in enumerate(self.classes)}, - module_config.ABSTAIN_DECODED: module_config.ABSTAIN_ENCODED, + ABSTAIN_DECODED: ABSTAIN_ENCODED, } self.label_decoder = {_v: _k for _k, _v in self.label_encoder.items()} @@ -549,14 +572,16 @@ def validate_labels(self, raise_exception=True): for _key in [*self.__class__.PUBLIC_SUBSETS, *self.__class__.PRIVATE_SUBSETS]: _invalid_indices = None assert "label" in self.dfs[_key].columns - _mask = self.dfs[_key]["label"].apply( - lambda x: int(x in self.label_encoder) + _mask = np.logical_not( + self.dfs[_key].column_isin( + "label", + self.label_encoder.keys(), + ) ) - # DO NOT change the "==" to "is"; False in pandas is not False below - _invalid_indices = np.where(_mask == 0)[0].tolist() + _invalid_indices = np.where(_mask)[0].tolist() if _invalid_indices: self._fail(f"Subset {_key} has invalid labels:") - self._print(self.dfs[_key].loc[_invalid_indices]) + self._print(self.dfs[_key].select_rows(_invalid_indices)()) if raise_exception: raise ValueError("invalid labels") @@ -579,7 +604,7 @@ def callback_export(event, path_root=None): # auto-determine the export path root if path_root is None: timestamp = current_time("%Y%m%d%H%M%S") - export_dir = module_config.DATA_SAVE_DIR + export_dir = DATA_SAVE_DIR path_root = os.path.join(export_dir, f"hover-dataset-{timestamp}") export_df = self.to_pandas() @@ -641,13 +666,13 @@ def update_population(): self.setup_label_coding() # re-compute label population - eff_labels = [module_config.ABSTAIN_DECODED, *self.classes] + eff_labels = [ABSTAIN_DECODED, *self.classes] color_dict = auto_label_color(self.classes) eff_colors = [color_dict[_label] for _label in eff_labels] pop_data = dict(color=eff_colors, label=eff_labels) for _subset in subsets: - _subpop = self.dfs[_subset]["label"].value_counts() + _subpop = self.dfs[_subset].column_counter("label") pop_data[f"count_{_subset}"] = [ _subpop.get(_label, 0) for _label in eff_labels ] @@ -686,7 +711,7 @@ def update_selection(selected_df): """ To be triggered as a subroutine of `self.selection_viewer`. 
""" - sel_source.data = selected_df.to_dict(orient="list") + sel_source.data = selected_df.to_dict_of_lists() # now that selection table has changed, clear sub-selection sel_source.selected.indices = [] @@ -703,7 +728,9 @@ def patch_edited_selection(): feature_value = sel_source.data[self.__class__.FEATURE_KEY][i] subset, idx = self.locate_by_feature_value(feature_value) for key in sel_source.data.keys(): - self.dfs[subset].at[idx, key] = sel_source.data[key][i] + self.dfs[subset].set_cell_by_row_column( + idx, key, sel_source.data[key][i] + ) self._good(f"Selection table: edited {len(raw_indices)} dataset rows.") # if edited labels (which is common), then population has changed @@ -731,21 +758,17 @@ def df_deduplicate(self): for _key in ordered_subsets: before[_key] = self.dfs[_key].shape[0] columns[_key] = self.dfs[_key].columns - self.dfs[_key]["__subset"] = _key + # update subset for rows that have been copied / moved + self.dfs[_key].set_column_by_constant(DATASET_SUBSET_FIELD, _key) # concatenate in order and deduplicate - overall_df = pd.concat( - [self.dfs[_key] for _key in ordered_subsets], axis=0, sort=False - ) - overall_df.drop_duplicates( - subset=[self.__class__.FEATURE_KEY], keep="last", inplace=True - ) - overall_df.reset_index(drop=True, inplace=True) + overall_df = DF.concat_rows([self.dfs[_key] for _key in ordered_subsets]) + overall_df = overall_df.unique(subset=[self.__class__.FEATURE_KEY], keep="last") # cut up slices for _key in ordered_subsets: - self.dfs[_key] = overall_df[overall_df["__subset"] == _key].reset_index( - drop=True, inplace=False + self.dfs[_key] = overall_df.filter_rows_by_operator( + DATASET_SUBSET_FIELD, operator.eq, _key )[columns[_key]] after[_key] = self.dfs[_key].shape[0] self._info(f"--subset {_key} rows: {before[_key]} -> {after[_key]}.") @@ -760,7 +783,9 @@ def vectorizer_lookup(self): def vectorizer_lookup(self, *args, **kwargs): self._fail("assigning vectorizer lookup by reference is forbidden.") - def compute_nd_embedding(self, vectorizer, method=None, dimension=2, **kwargs): + def compute_nd_embedding( + self, vectorizer, method=DEFAULT_REDUCTION_METHOD, dimension=2, **kwargs + ): """ ???+ note "Get embeddings in n-dimensional space and return the dimensionality reducer." 
Reference: [`DimensionalityReducer`](https://github.com/phurwicz/hover/blob/main/hover/core/representation/reduction.py) @@ -774,9 +799,12 @@ def compute_nd_embedding(self, vectorizer, method=None, dimension=2, **kwargs): """ from hover.core.representation.reduction import DimensionalityReducer - if method is None: - method = hover.config["data.embedding"]["default_reduction_method"] # register the vectorizer for scenarios that may need it + assert ( + isinstance(dimension, int) and dimension >= 2 + ), f"Invalid dimension {dimension}" + if dimension in self.vectorizer_lookup: + self._warn(f"Overwriting embedding with dimension {dimension}.") self.vectorizer_lookup[dimension] = vectorizer # prepare input vectors to manifold learning @@ -784,16 +812,19 @@ def compute_nd_embedding(self, vectorizer, method=None, dimension=2, **kwargs): trans_subset = [*self.__class__.PRIVATE_SUBSETS] assert not set(fit_subset).intersection(set(trans_subset)), "Unexpected overlap" - assert isinstance(dimension, int) and dimension >= 2 embedding_cols = [embedding_field(dimension, i) for i in range(dimension)] # compute vectors and keep track which where to slice the array for fitting feature_inp = [] for _key in fit_subset: - feature_inp.extend(self.dfs[_key][self.__class__.FEATURE_KEY].tolist()) + feature_inp.extend( + DF.series_tolist(self.dfs[_key][self.__class__.FEATURE_KEY]) + ) fit_num = len(feature_inp) for _key in trans_subset: - feature_inp.extend(self.dfs[_key][self.__class__.FEATURE_KEY].tolist()) + feature_inp.extend( + DF.series_tolist(self.dfs[_key][self.__class__.FEATURE_KEY]) + ) trans_arr = np.array( [vectorizer(_inp) for _inp in tqdm(feature_inp, desc="Vectorizing")] ) @@ -810,12 +841,12 @@ def compute_nd_embedding(self, vectorizer, method=None, dimension=2, **kwargs): trans_embedding = reducer.transform(trans_arr[fit_num:], method) # assign x and y coordinates to dataset - start_idx = 0 for _subset, _embedding in [ (fit_subset, fit_embedding), (trans_subset, trans_embedding), ]: - # edge case: embedding is too small + start_idx = 0 + # edge case: embedding has no rows if _embedding.shape[0] < 1: for _key in _subset: assert ( @@ -824,17 +855,25 @@ def compute_nd_embedding(self, vectorizer, method=None, dimension=2, **kwargs): continue for _key in _subset: _length = self.dfs[_key].shape[0] + _embedding_slice = _embedding[start_idx : (start_idx + _length), :] + assert ( + _length == _embedding_slice.shape[0] + ), f"Unexpected length {_length} vs {_embedding_slice.shape}; embedding total {_embedding.shape}" for _i in range(dimension): _col = embedding_cols[_i] - self.dfs[_key][_col] = pd.Series( - _embedding[start_idx : (start_idx + _length), _i] + self.dfs[_key].set_column_by_array( + _col, + _embedding_slice[:, _i], + indices=None, ) start_idx += _length self._good(f"Computed {dimension}-d embedding in columns {embedding_cols}") return reducer - def compute_2d_embedding(self, vectorizer, method=None, **kwargs): + def compute_2d_embedding( + self, vectorizer, method=DEFAULT_REDUCTION_METHOD, **kwargs + ): """ ???+ note "Get embeddings in the xy-plane and return the dimensionality reducer." A special case of `compute_nd_embedding`.
@@ -846,7 +885,7 @@ def compute_2d_embedding(self, vectorizer, method=None, **kwargs): | `**kwargs` | | kwargs for `DimensionalityReducer` | """ reducer = self.compute_nd_embedding( - vectorizer, method=None, dimension=2, **kwargs + vectorizer, method=method, dimension=2, **kwargs ) return reducer @@ -869,7 +908,9 @@ def loader(self, key, *vectorizers, batch_size=64, smoothing_coeff=0.0): ) # take the slice that has a meaningful label - df = self.dfs[key][self.dfs[key]["label"] != module_config.ABSTAIN_DECODED] + df = self.dfs[key].filter_rows_by_operator( + "label", operator.ne, ABSTAIN_DECODED + ) # edge case: valid slice is too small if df.shape[0] < 1: @@ -877,7 +918,7 @@ def loader(self, key, *vectorizers, batch_size=64, smoothing_coeff=0.0): batch_size = min(batch_size, df.shape[0]) # prepare output vectors - labels = df["label"].apply(lambda x: self.label_encoder[x]).tolist() + labels = df.column_map("label", self.label_encoder, output="list") output_vectors = one_hot(labels, num_classes=len(self.classes)) if smoothing_coeff > 0.0: output_vectors = label_smoothing( @@ -887,7 +928,7 @@ def loader(self, key, *vectorizers, batch_size=64, smoothing_coeff=0.0): # prepare input vectors assert len(vectorizers) > 0, "Expected at least one vectorizer" multi_flag = len(vectorizers) > 1 - features = df[self.__class__.FEATURE_KEY].tolist() + features = DF.series_tolist(df[self.__class__.FEATURE_KEY]) input_vector_lists = [] for _vec_func in vectorizers: diff --git a/hover/core/explorer/base.py b/hover/core/explorer/base.py index 11843236..f8b5470f 100644 --- a/hover/core/explorer/base.py +++ b/hover/core/explorer/base.py @@ -1,7 +1,6 @@ """ ???+ note "Base class(es) for ALL explorer implementations." """ -import pandas as pd from abc import ABC, abstractmethod from collections import OrderedDict, defaultdict from bokeh.events import SelectionGeometry @@ -15,6 +14,8 @@ from hover.utils.bokeh_helper import bokeh_hover_tooltip from hover.utils.meta.traceback import RichTracebackABCMeta from hover.utils.misc import RootUnionFind +from hover.utils.typecheck import TypedValueDict +from hover.module_config import DataFrame from .local_config import SEARCH_SCORE_FIELD STANDARD_PLOT_TOOLS = [ @@ -51,7 +52,7 @@ class BokehBaseExplorer(Loggable, ABC, metaclass=RichTracebackABCMeta): SELECTION_PROCESSING_STAGES = ["save", "load", "write", "read"] PRIMARY_FEATURE = None - MANDATORY_COLUMNS = ["label"] + MANDATORY_COLUMN_TO_TYPE_DEFAULT = {"label": (str, None)} TOOLTIP_KWARGS = { "label": {"label": "Label"}, "coords": True, @@ -285,7 +286,7 @@ def adjust_slider(): col_patch in _df.columns ), f"Subset {_key} expecting column {col_patch} among columns, got {_df.columns}" # find all array lengths; note that the data subset can be empty - _num_patches_seen = _df[col_patch].apply(len).values + _num_patches_seen = _df.column_apply(col_patch, len, output="list") assert ( len(set(_num_patches_seen)) <= 1 ), f"Expecting consistent number of patches, got {_num_patches_seen}" @@ -309,7 +310,7 @@ def adjust_slider(): def update_patch(attr, old, new): for _key, _df in self.dfs.items(): # calculate the patch corresponding to slider value - _value = [_arr[new] for _arr in _df[col_patch].values] + _value = [_arr[new] for _arr in DataFrame.series_values(_df[col_patch])] _slice = slice(_df.shape[0]) _patch = {col_original: [(_slice, _value)]} self.sources[_key].patch(_patch) @@ -317,15 +318,36 @@ def update_patch(attr, old, new): slider.on_change("value", update_patch) self._good(f"Patching {col_original} using {col_patch}") 
- def _mandatory_column_defaults(self): + def _mandatory_column_info(self): """ - ???+ note "Mandatory columns and default values." + ???+ note "Mandatory columns, types, and default values." If default value is None, will raise exception if the column is not found. """ - return {_col: None for _col in self.__class__.MANDATORY_COLUMNS} + return { + _col: {"type": _type, "default": _default} + for _col, ( + _type, + _default, + ) in self.__class__.MANDATORY_COLUMN_TO_TYPE_DEFAULT.items() + } - def _setup_dfs(self, df_dict, copy=False): + @property + def dfs(self): + """ + ???+ note "Subset -> DataFrame mapping." + """ + return self._dfs + + @dfs.setter + def dfs(self, dfs): + assert isinstance( + dfs, TypedValueDict + ), f"Expected TypedValueDict, got {type(dfs)}" + assert not hasattr(self, "_dfs"), "Resetting `dfs` is forbidden." + self._dfs = dfs + + def _setup_dfs(self, df_dict): """ ???+ note "Check and store DataFrames **by reference by default**." Intended to be extended in child classes for pre/post processing. @@ -333,9 +355,10 @@ def _setup_dfs(self, df_dict, copy=False): | Param | Type | Description | | :---------- | :----- | :--------------------------- | | `df_dict` | `dict` | `str` -> `DataFrame` mapping | - | `copy` | `bool` | whether to copy `DataFrame`s | """ self._info("Setting up DataFrames") + for _df in df_dict.values(): + assert isinstance(_df, DataFrame), f"Expected DataFrame, got {type(_df)}" supplied_keys = set(df_dict.keys()) expected_keys = set(self.__class__.SUBSET_GLYPH_KWARGS.keys()) @@ -345,20 +368,21 @@ def _setup_dfs(self, df_dict, copy=False): expected_not_supplied = expected_keys.difference(supplied_keys) for _key in supplied_not_expected: - self._warn( - f"{self.__class__.__name__}.__init__(): got unexpected df key {_key}" - ) + self._warn(f"expected df keys {list(expected_keys)}, not {_key}") for _key in expected_not_supplied: - self._warn( - f"{self.__class__.__name__}.__init__(): missing expected df key {_key}" - ) + self._warn(f"expected df keys {list(expected_keys)}, missing {_key}") # assign df with column checks - self.dfs = dict() - mandatory_col_to_default = self._mandatory_column_defaults() + if not hasattr(self, "dfs"): + self.dfs = TypedValueDict(DataFrame) + else: + self.dfs.clear() + + mandatory_col_info = self._mandatory_column_info() for _key in expected_and_supplied: _df = df_dict[_key] - for _col, _default in mandatory_col_to_default.items(): + for _col, _dict in mandatory_col_info.items(): + _default = _dict["default"] # column exists: all good if _col in _df.columns: continue @@ -369,12 +393,14 @@ def _setup_dfs(self, df_dict, copy=False): assert _df.shape[0] == 0, _msg # default value available, will use it to create column else: - _df[_col] = _default - self.dfs[_key] = _df.copy() if copy else _df + _df.set_column_by_constant(_col, _default) + self.dfs[_key] = _df # expected dfs must be present for _key in expected_not_supplied: - _df = pd.DataFrame(columns=list(mandatory_col_to_default.keys())) + _df = DataFrame.empty_with_columns( + {col: _d["type"] for col, _d in mandatory_col_info.items()} + ) self.dfs[_key] = _df def _setup_sources(self): @@ -383,7 +409,10 @@ def _setup_sources(self): Intended to be extended in child classes for pre/post processing. 
""" self._info("Setting up sources") - self.sources = {_key: ColumnDataSource(_df) for _key, _df in self.dfs.items()} + self.sources = { + _key: ColumnDataSource(_df.to_dict_of_lists()) + for _key, _df in self.dfs.items() + } self._postprocess_sources() # initialize attributes that couple with sources @@ -536,7 +565,7 @@ def _update_sources(self): such as dynamic plotting kwargs, need to be re-assigned. """ for _key in self.dfs.keys(): - self.sources[_key].data = self.dfs[_key] + self.sources[_key].data = self.dfs[_key].to_dict_of_lists() self._postprocess_sources() # reset selections now that source indices may have changed @@ -845,6 +874,9 @@ def find_embedding_fields(self): else: # embedding columns must be the same across subsets assert embedding_cols == _emb_cols, "Inconsistent embedding columns" + assert ( + embedding_cols is not None + ), f"No embedding columns found: {[_df.columns for _df in self.dfs.values()]}" assert ( len(embedding_cols) >= 2 ), f"Expected at least two embedding columns, found {embedding_cols}" @@ -858,7 +890,7 @@ def auto_color_mapping(self): labels = set() for _key in self.dfs.keys(): - labels = labels.union(set(self.dfs[_key]["label"].values)) + labels = labels.union(set(DataFrame.series_values(self.dfs[_key]["label"]))) return auto_label_color(labels) diff --git a/hover/core/explorer/feature.py b/hover/core/explorer/feature.py index c61ce31e..ef6fca4c 100644 --- a/hover/core/explorer/feature.py +++ b/hover/core/explorer/feature.py @@ -2,11 +2,11 @@ ???+ note "Intermediate classes based on the main feature." """ import re -import hover import numpy as np from functools import lru_cache from bokeh.models import TextInput, Slider from .base import BokehBaseExplorer +from .local_config import TOOLTIP_IMG_STYLE class BokehForText(BokehBaseExplorer): @@ -23,7 +23,10 @@ class BokehForText(BokehBaseExplorer): """ PRIMARY_FEATURE = "text" - MANDATORY_COLUMNS = [PRIMARY_FEATURE, "label"] + MANDATORY_COLUMN_TO_TYPE_DEFAULT = { + PRIMARY_FEATURE: (str, None), + "label": (str, None), + } TOOLTIP_KWARGS = { "label": {"label": "Label"}, "text": {"text": "Text"}, @@ -174,7 +177,10 @@ class BokehForAudio(BokehForUrlToVector): """ PRIMARY_FEATURE = "audio" - MANDATORY_COLUMNS = [PRIMARY_FEATURE, "label"] + MANDATORY_COLUMN_TO_TYPE_DEFAULT = { + PRIMARY_FEATURE: (str, None), + "label": (str, None), + } TOOLTIP_KWARGS = { "label": {"label": "Label"}, "audio": {"audio": ""}, @@ -197,10 +203,13 @@ class BokehForImage(BokehForUrlToVector): """ PRIMARY_FEATURE = "image" - MANDATORY_COLUMNS = [PRIMARY_FEATURE, "label"] + MANDATORY_COLUMN_TO_TYPE_DEFAULT = { + PRIMARY_FEATURE: (str, None), + "label": (str, None), + } TOOLTIP_KWARGS = { "label": {"label": "Label"}, - "image": {"image": hover.config["visual"]["tooltip_img_style"]}, + "image": {"image": TOOLTIP_IMG_STYLE}, "coords": True, "index": True, } diff --git a/hover/core/explorer/functionality.py b/hover/core/explorer/functionality.py index 97131fa4..10ee314f 100644 --- a/hover/core/explorer/functionality.py +++ b/hover/core/explorer/functionality.py @@ -6,7 +6,10 @@ from bokeh.models import CDSView, IndexFilter, Dropdown, Button from bokeh.palettes import Category20 from bokeh.layouts import row -from hover import module_config +from hover.module_config import ( + DataFrame as DF, + ABSTAIN_DECODED, +) from hover.utils.misc import current_time from hover.utils.bokeh_helper import bokeh_hover_tooltip from .local_config import SOURCE_COLOR_FIELD, SOURCE_ALPHA_FIELD, SEARCH_SCORE_FIELD @@ -130,11 +133,7 @@ def 
_postprocess_sources(self): color_dict = self.auto_color_mapping() for _key, _df in self.dfs.items(): - _color = ( - _df["label"] - .apply(lambda label: color_dict.get(label, "gainsboro")) - .tolist() - ) + _color = _df.column_map("label", color_dict, output="list") self.sources[_key].add(_color, SOURCE_COLOR_FIELD) def _update_colors(self): @@ -146,11 +145,7 @@ def _update_colors(self): # infer glyph colors dynamically color_dict = self.auto_color_mapping() - color_list = ( - self.dfs["raw"]["label"] - .apply(lambda label: color_dict.get(label, "gainsboro")) - .tolist() - ) + color_list = self.dfs["raw"].column_map("label", color_dict, output="list") self.sources["raw"].patch( {SOURCE_COLOR_FIELD: [(slice(len(color_list)), color_list)]} ) @@ -188,7 +183,11 @@ def callback_apply(): self._info(f"applying {len(selected_idx)} annotations...") # update label in both the df and the data source - self.dfs["raw"].loc[selected_idx, "label"] = label + self.dfs["raw"].set_column_by_constant( + "label", + label, + indices=selected_idx, + ) patch_to_apply = [(_idx, label) for _idx in selected_idx] self.sources["raw"].patch({"label": patch_to_apply}) self._good(f"applied {len(selected_idx)} annotations: {label}") @@ -272,20 +271,20 @@ def _build_tooltip(self, specified): ) return specified - def _mandatory_column_defaults(self): + def _mandatory_column_info(self): """ - ???+ note "Mandatory columns and default values." + ???+ note "Mandatory columns, types, and default values." If default value is None, will raise exception if the column is not found. """ - column_to_value = super()._mandatory_column_defaults() - column_to_value.update( + column_info = super()._mandatory_column_info() + column_info.update( { - self.label_col: module_config.ABSTAIN_DECODED, - self.score_col: 0.5, + self.label_col: {"type": str, "default": ABSTAIN_DECODED}, + self.score_col: {"type": float, "default": 0.5}, } ) - return column_to_value + return column_info def _postprocess_sources(self): """ @@ -294,12 +293,9 @@ def _postprocess_sources(self): # infer glyph color from labels color_dict = self.auto_color_mapping() - def get_color(label): - return color_dict.get(label, "gainsboro") - # infer glyph alpha from pseudo-percentile of soft label scores scores = np.concatenate( - [_df[self.score_col].tolist() for _df in self.dfs.values()] + [DF.series_tolist(_df[self.score_col]) for _df in self.dfs.values()] ) scores_mean = scores.mean() scores_std = scores.std() + 1e-4 @@ -314,8 +310,8 @@ def pseudo_percentile(confidence, lower=0.1, upper=0.9): # infer alpha from score percentiles for _key, _df in self.dfs.items(): - _color = _df[self.label_col].apply(get_color).tolist() - _alpha = _df[self.score_col].apply(pseudo_percentile).tolist() + _color = _df.column_map(self.label_col, color_dict, output="list") + _alpha = _df.column_apply(self.score_col, pseudo_percentile, output="list") self.sources[_key].add(_color, SOURCE_COLOR_FIELD) self.sources[_key].add(_alpha, SOURCE_ALPHA_FIELD) @@ -346,6 +342,7 @@ def subroutine(df, lower, upper): """ Calculate indices with score between lower/upper bounds. 
""" + # note: comparing series with scalar is the same in pandas/polars keep_l = set(np.where(df[self.score_col] >= lower)[0]) keep_u = set(np.where(df[self.score_col] <= upper)[0]) kept = keep_l.intersection(keep_u) @@ -444,17 +441,17 @@ def __init__(self, df_dict, label_col_a, label_col_b, **kwargs): self.label_col_b = label_col_b super().__init__(df_dict, **kwargs) - def _mandatory_column_defaults(self): + def _mandatory_column_info(self): """ ???+ note "Mandatory columns and default values." If default value is None, will raise exception if the column is not found. """ - column_to_value = super()._mandatory_column_defaults() + column_to_value = super()._mandatory_column_info() column_to_value.update( { - self.label_col_a: None, - self.label_col_b: None, + self.label_col_a: {"type": str, "default": None}, + self.label_col_b: {"type": str, "default": None}, } ) return column_to_value @@ -476,10 +473,13 @@ def plot(self, label, **kwargs): eff_kwargs["legend_label"] = f"{label}" # create agreement/increment/decrement subsets - col_a_pos = np.where(self.dfs[_key][self.label_col_a] == label)[0].tolist() - col_a_neg = np.where(self.dfs[_key][self.label_col_a] != label)[0].tolist() - col_b_pos = np.where(self.dfs[_key][self.label_col_b] == label)[0].tolist() - col_b_neg = np.where(self.dfs[_key][self.label_col_b] != label)[0].tolist() + # note: comparing series with constant is the same in pandas/polars + mask_a = DF.series_values(self.dfs[_key][self.label_col_a] == label) + mask_b = DF.series_values(self.dfs[_key][self.label_col_b] == label) + col_a_pos = np.where(mask_a)[0].tolist() + col_a_neg = np.where(np.logical_not(mask_a))[0].tolist() + col_b_pos = np.where(mask_b)[0].tolist() + col_b_neg = np.where(np.logical_not(mask_b))[0].tolist() agreement_view = CDSView( source=_source, filters=[IndexFilter(col_a_pos), IndexFilter(col_b_pos)] ) @@ -653,13 +653,11 @@ def callback_apply(event): ) return - labels = self.dfs["raw"].iloc[selected_idx].apply(lf, axis=1).values - num_nontrivial = len( - list(filter(lambda l: l != module_config.ABSTAIN_DECODED, labels)) - ) + labels = self.dfs["raw"].row_apply(lf, indices=selected_idx, output="numpy") + num_nontrivial = len(list(filter(lambda l: l != ABSTAIN_DECODED, labels))) # update label in both the df and the data source - self.dfs["raw"].loc[selected_idx, "label"] = labels + self.dfs["raw"].set_column_by_array("label", labels, indices=selected_idx) for _idx, _label in zip(selected_idx, labels): _idx = int(_idx) self.sources["raw"].patch({"label": [(_idx, _label)]}) @@ -692,11 +690,13 @@ def callback_filter(event): for _key, _source in self.sources.items(): _selected = _source.selected.indices - _labels = self.dfs[_key].iloc[_selected].apply(lf, axis=1).values + _labels = self.dfs[_key].row_apply( + lf, indices=_selected, output="numpy" + ) _kept = [ _idx for _idx, _label in zip(_selected, _labels) - if _label != module_config.ABSTAIN_DECODED + if _label != ABSTAIN_DECODED ] self.sources[_key].selected.indices = _kept @@ -798,8 +798,8 @@ def refresh_glyphs(self, lf_name): assert lf_name in self.lf_data, f"trying to refresh non-existing LF: {lf_name}" lf = self.lf_data[lf_name]["lf"] - L_raw = self.dfs["raw"].apply(lf, axis=1).values - L_labeled = self.dfs["labeled"].apply(lf, axis=1).values + L_raw = self.dfs["raw"].row_apply(lf, output="numpy") + L_labeled = self.dfs["labeled"].row_apply(lf, output="numpy") glyph_codes = self.lf_data[lf_name]["glyphs"].keys() if "C" in glyph_codes: @@ -841,9 +841,9 @@ def plot_new_lf( # calculate predicted labels 
if not provided if L_raw is None: - L_raw = self.dfs["raw"].apply(lf, axis=1).values + L_raw = self.dfs["raw"].row_apply(lf, output="numpy") if L_labeled is None: - L_labeled = self.dfs["labeled"].apply(lf, axis=1).values + L_labeled = self.dfs["labeled"].row_apply(lf, output="numpy") # prepare plot settings assert self.palette, f"Palette depleted, # LFs: {len(self.lf_data)}" @@ -923,8 +923,8 @@ def _view_correct(self, L_labeled): if L_labeled.shape[0] == 0: indices = [] else: - agreed = self.dfs["labeled"]["label"].values == L_labeled - attempted = L_labeled != module_config.ABSTAIN_DECODED + agreed = DF.series_values(self.dfs["labeled"]["label"]) == L_labeled + attempted = L_labeled != ABSTAIN_DECODED indices = np.where(np.multiply(agreed, attempted))[0].tolist() view = CDSView(source=self.sources["labeled"], filters=[IndexFilter(indices)]) return view @@ -939,8 +939,8 @@ def _view_incorrect(self, L_labeled): if L_labeled.shape[0] == 0: indices = [] else: - disagreed = self.dfs["labeled"]["label"].values != L_labeled - attempted = L_labeled != module_config.ABSTAIN_DECODED + disagreed = DF.series_values(self.dfs["labeled"]["label"]) != L_labeled + attempted = L_labeled != ABSTAIN_DECODED indices = np.where(np.multiply(disagreed, attempted))[0].tolist() view = CDSView(source=self.sources["labeled"], filters=[IndexFilter(indices)]) return view @@ -956,8 +956,10 @@ def _view_missed(self, L_labeled, targets): if L_labeled.shape[0] == 0: indices = [] else: - targetable = np.isin(self.dfs["labeled"]["label"], targets) - abstained = L_labeled == module_config.ABSTAIN_DECODED + targetable = np.isin( + DF.series_values(self.dfs["labeled"]["label"]), targets + ) + abstained = L_labeled == ABSTAIN_DECODED indices = np.where(np.multiply(targetable, abstained))[0].tolist() view = CDSView(source=self.sources["labeled"], filters=[IndexFilter(indices)]) return view @@ -972,6 +974,6 @@ def _view_hit(self, L_raw): if L_raw.shape[0] == 0: indices = [] else: - indices = np.where(L_raw != module_config.ABSTAIN_DECODED)[0].tolist() + indices = np.where(L_raw != ABSTAIN_DECODED)[0].tolist() view = CDSView(source=self.sources["raw"], filters=[IndexFilter(indices)]) return view diff --git a/hover/core/explorer/local_config.py b/hover/core/explorer/local_config.py index 16e7c7d1..50b639df 100644 --- a/hover/core/explorer/local_config.py +++ b/hover/core/explorer/local_config.py @@ -1,5 +1,11 @@ import hover +from hover.config_constants import ( + ConfigSection as Section, + ConfigKey as Key, +) -SOURCE_COLOR_FIELD = hover.config["data.columns"]["source_color_field"] -SOURCE_ALPHA_FIELD = hover.config["data.columns"]["source_alpha_field"] -SEARCH_SCORE_FIELD = hover.config["data.columns"]["search_score_field"] +SOURCE_COLOR_FIELD = hover.config[Section.DATA_COLUMNS][Key.SOURCE_COLOR_FIELD] +SOURCE_ALPHA_FIELD = hover.config[Section.DATA_COLUMNS][Key.SOURCE_ALPHA_FIELD] +SEARCH_SCORE_FIELD = hover.config[Section.DATA_COLUMNS][Key.SEARCH_SCORE_FIELD] + +TOOLTIP_IMG_STYLE = hover.config[Section.VISUAL][Key.TOOLTIP_IMG_STYLE] diff --git a/hover/core/explorer/specialization.py b/hover/core/explorer/specialization.py index f2c67878..63ec0812 100644 --- a/hover/core/explorer/specialization.py +++ b/hover/core/explorer/specialization.py @@ -19,7 +19,7 @@ class BokehTextFinder(BokehDataFinder, BokehForText): """ TOOLTIP_KWARGS = BokehForText.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForText.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForText.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = 
BokehDataFinder.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -42,7 +42,7 @@ class BokehTextAnnotator(BokehDataAnnotator, BokehForText): """ TOOLTIP_KWARGS = BokehForText.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForText.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForText.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehDataAnnotator.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -63,7 +63,7 @@ class BokehTextSoftLabel(BokehSoftLabelExplorer, BokehForText): """ TOOLTIP_KWARGS = BokehForText.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForText.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForText.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehSoftLabelExplorer.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -84,7 +84,7 @@ class BokehTextMargin(BokehMarginExplorer, BokehForText): """ TOOLTIP_KWARGS = BokehForText.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForText.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForText.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehMarginExplorer.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -104,7 +104,7 @@ class BokehTextSnorkel(BokehSnorkelExplorer, BokehForText): """ TOOLTIP_KWARGS = BokehForText.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForText.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForText.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehSnorkelExplorer.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -125,7 +125,7 @@ class BokehAudioFinder(BokehDataFinder, BokehForAudio): """ TOOLTIP_KWARGS = BokehForAudio.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForAudio.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForAudio.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehDataFinder.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -148,7 +148,7 @@ class BokehAudioAnnotator(BokehDataAnnotator, BokehForAudio): """ TOOLTIP_KWARGS = BokehForAudio.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForAudio.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForAudio.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehDataAnnotator.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -169,7 +169,7 @@ class BokehAudioSoftLabel(BokehSoftLabelExplorer, BokehForAudio): """ TOOLTIP_KWARGS = BokehForAudio.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForAudio.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForAudio.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehSoftLabelExplorer.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -190,7 +190,7 @@ class BokehAudioMargin(BokehMarginExplorer, BokehForAudio): """ TOOLTIP_KWARGS = BokehForAudio.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForAudio.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForAudio.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehMarginExplorer.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -210,7 +210,7 @@ class BokehAudioSnorkel(BokehSnorkelExplorer, BokehForAudio): """ TOOLTIP_KWARGS = BokehForAudio.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForAudio.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForAudio.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehSnorkelExplorer.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -231,7 +231,7 @@ class BokehImageFinder(BokehDataFinder, BokehForImage): """ TOOLTIP_KWARGS = BokehForImage.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForImage.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForImage.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = 
BokehDataFinder.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -254,7 +254,7 @@ class BokehImageAnnotator(BokehDataAnnotator, BokehForImage): """ TOOLTIP_KWARGS = BokehForImage.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForImage.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForImage.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehDataAnnotator.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -275,7 +275,7 @@ class BokehImageSoftLabel(BokehSoftLabelExplorer, BokehForImage): """ TOOLTIP_KWARGS = BokehForImage.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForImage.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForImage.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehSoftLabelExplorer.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -296,7 +296,7 @@ class BokehImageMargin(BokehMarginExplorer, BokehForImage): """ TOOLTIP_KWARGS = BokehForImage.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForImage.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForImage.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehMarginExplorer.SUBSET_GLYPH_KWARGS def _layout_widgets(self): @@ -316,7 +316,7 @@ class BokehImageSnorkel(BokehSnorkelExplorer, BokehForImage): """ TOOLTIP_KWARGS = BokehForImage.TOOLTIP_KWARGS - MANDATORY_COLUMNS = BokehForImage.MANDATORY_COLUMNS + MANDATORY_COLUMN_TO_TYPE_DEFAULT = BokehForImage.MANDATORY_COLUMN_TO_TYPE_DEFAULT SUBSET_GLYPH_KWARGS = BokehSnorkelExplorer.SUBSET_GLYPH_KWARGS def _layout_widgets(self): diff --git a/hover/core/local_config.py b/hover/core/local_config.py index 49a56ddb..0787ad61 100644 --- a/hover/core/local_config.py +++ b/hover/core/local_config.py @@ -1,5 +1,9 @@ import re import hover +from hover.config_constants import ( + ConfigSection as Section, + ConfigKey as Key, +) from bokeh.models import ( Div, TableColumn, @@ -8,7 +12,10 @@ ) -DATASET_SUBSET_FIELD = hover.config["data.columns"]["dataset_subset_field"] +DEFAULT_REDUCTION_METHOD = hover.config[Section.DATA_EMBEDDING][ + Key.DEFAULT_REDUCTION_METHOD +] +DATASET_SUBSET_FIELD = hover.config[Section.DATA_COLUMNS][Key.DATASET_SUBSET_FIELD] COLOR_GLYPH_TEMPLATE = """

@@ -16,7 +23,7 @@

""" -EMBEDDING_FIELD_PREFIX = hover.config["data.columns"]["embedding_field_prefix"] +EMBEDDING_FIELD_PREFIX = hover.config[Section.DATA_COLUMNS][Key.EMBEDDING_FIELD_PREFIX] EMBEDDING_FIELD_REGEX = r"\d+d_\d+$" @@ -53,7 +60,7 @@ def dataset_default_sel_table_columns(feature_key): template="""<%= value %>""" ) elif feature_key == "image": - style = hover.config["visual"]["table_img_style"] + style = hover.config[Section.VISUAL][Key.TABLE_IMG_STYLE] # width is easily adjustable on the UI, no need to make configurable here feature_col_kwargs["width"] = 200 feature_col_kwargs["formatter"] = HTMLTemplateFormatter( diff --git a/hover/core/neural.py b/hover/core/neural.py index 0eafbe83..44e5bbfa 100644 --- a/hover/core/neural.py +++ b/hover/core/neural.py @@ -4,7 +4,6 @@ `torch`-based template classes for implementing neural nets that work the most smoothly with `hover`. """ import os -import hover import numpy as np import torch import torch.nn.functional as F @@ -15,6 +14,7 @@ from hover.core import Loggable from hover.utils.metrics import classification_accuracy from hover.utils.misc import current_time +from .local_config import DEFAULT_REDUCTION_METHOD class BaseVectorNet(Loggable): @@ -325,7 +325,11 @@ def predict_proba(self, inps): return probs def manifold_trajectory( - self, inps, method=None, reducer_kwargs=None, spline_kwargs=None + self, + inps, + method=DEFAULT_REDUCTION_METHOD, + reducer_kwargs=None, + spline_kwargs=None, ): """ ???+ note "Compute a propagation trajectory of the dataset manifold through the neural net." @@ -346,9 +350,6 @@ def manifold_trajectory( from hover.core.representation.manifold import LayerwiseManifold from hover.core.representation.trajectory import manifold_spline - if method is None: - method = hover.config["data.embedding"]["default_reduction_method"] - reducer_kwargs = reducer_kwargs or {} spline_kwargs = spline_kwargs or {} diff --git a/hover/core/representation/local_config.py b/hover/core/representation/local_config.py index 15ba1b5f..93be1e0b 100644 --- a/hover/core/representation/local_config.py +++ b/hover/core/representation/local_config.py @@ -1,6 +1,16 @@ +import hover +from hover.config_constants import ( + ConfigSection as Section, + ConfigKey as Key, +) + KWARG_TRANSLATOR = { "dimension": { "umap": "n_components", "ivis": "embedding_dims", }, } + +DEFAULT_REDUCTION_METHOD = hover.config[Section.DATA_EMBEDDING][ + Key.DEFAULT_REDUCTION_METHOD +] diff --git a/hover/core/representation/manifold.py b/hover/core/representation/manifold.py index a08f1bdf..cea3462c 100644 --- a/hover/core/representation/manifold.py +++ b/hover/core/representation/manifold.py @@ -2,11 +2,11 @@ Manifold similarity measures for any collection of sequences of vectors. Can be useful for improved interpretability of neural nets. 
""" -import hover from tqdm import tqdm from scipy.spatial import procrustes from hover.core import Loggable from .reduction import DimensionalityReducer +from .local_config import DEFAULT_REDUCTION_METHOD class LayerwiseManifold(Loggable): @@ -64,7 +64,7 @@ def unfold(self, method=None, **kwargs): :type method: str """ if method is None: - method = hover.config["data.embedding"]["default_reduction_method"] + method = DEFAULT_REDUCTION_METHOD # default kwargs should fix random state and seed # so that randomness does not introduce disparity diff --git a/hover/core/representation/reduction.py b/hover/core/representation/reduction.py index 7d1655fd..11909312 100644 --- a/hover/core/representation/reduction.py +++ b/hover/core/representation/reduction.py @@ -5,10 +5,9 @@ Icing on the cake: unify the syntax across different kinds of reducers. """ -import hover import numpy as np from hover.core import Loggable -from .local_config import KWARG_TRANSLATOR +from .local_config import KWARG_TRANSLATOR, DEFAULT_REDUCTION_METHOD class DimensionalityReducer(Loggable): @@ -22,7 +21,7 @@ def __init__(self, array): self.reference_array = array @staticmethod - def create_reducer(method=None, *args, **kwargs): + def create_reducer(method=DEFAULT_REDUCTION_METHOD, *args, **kwargs): """ ???+ note "Handle kwarg translation and dynamic imports." @@ -32,9 +31,6 @@ def create_reducer(method=None, *args, **kwargs): | `*args` | | forwarded to the reducer | | `**kwargs` | | translated and forwarded | """ - if method is None: - method = hover.config["data.embedding"]["default_reduction_method"] - if method == "umap": import umap @@ -57,7 +53,7 @@ def create_reducer(method=None, *args, **kwargs): reducer = reducer_cls(*args, **translated_kwargs) return reducer - def fit_transform(self, method=None, *args, **kwargs): + def fit_transform(self, method=DEFAULT_REDUCTION_METHOD, *args, **kwargs): """ ???+ note "Fit and transform an array and store the reducer." | Param | Type | Description | @@ -66,15 +62,12 @@ def fit_transform(self, method=None, *args, **kwargs): | `*args` | | forwarded to the reducer | | `**kwargs` | | forwarded to the reducer | """ - if method is None: - method = hover.config["data.embedding"]["default_reduction_method"] - reducer = DimensionalityReducer.create_reducer(method=method, *args, **kwargs) embedding = reducer.fit_transform(self.reference_array) setattr(self, method, reducer) return embedding - def transform(self, array, method=None): + def transform(self, array, method=DEFAULT_REDUCTION_METHOD): """ ???+ note "Transform an array with a already-fitted reducer." 
| Param | Type | Description | @@ -82,9 +75,6 @@ def transform(self, array, method=None): | `array` | `np.ndarray` | the array to transform | | `method` | `str` | `"umap"` or `"ivis"` | """ - if method is None: - method = hover.config["data.embedding"]["default_reduction_method"] - assert isinstance(array, np.ndarray), f"Expected np.ndarray, got {type(array)}" # edge case: array is too small if array.shape[0] < 1: diff --git a/hover/module_config.py b/hover/module_config.py index dff6c60b..6cc32e17 100644 --- a/hover/module_config.py +++ b/hover/module_config.py @@ -1,12 +1,27 @@ import hover +from .config_constants import ( + ConfigSection as Section, + ConfigKey as Key, +) +from .utils.dataframe import ( + PandasDataframe, + PolarsDataframe, +) + +# dataframe implementation +DataFrame = ( + PandasDataframe + if hover.config[Section.BACKEND][Key.DATAFRAME_LIBRARY].lower() == "pandas" + else PolarsDataframe +) # constants for the abstain mechanism -ABSTAIN_DECODED = hover.config["data.values"]["abstain_decoded"] -ABSTAIN_ENCODED = hover.config["data.values"]["abstain_encoded"] -ABSTAIN_HEXCOLOR = hover.config["visual"]["abstain_hexcolor"] +ABSTAIN_DECODED = hover.config[Section.DATA_VALUES][Key.ABSTAIN_DECODED] +ABSTAIN_ENCODED = hover.config[Section.DATA_VALUES][Key.ABSTAIN_ENCODED] +ABSTAIN_HEXCOLOR = hover.config[Section.VISUAL][Key.ABSTAIN_HEXCOLOR] # constants for label encoding mechanism -ENCODED_LABEL_KEY = hover.config["data.columns"]["encoded_label_key"] +ENCODED_LABEL_KEY = hover.config[Section.DATA_COLUMNS][Key.ENCODED_LABEL_KEY] # constants for saving work -DATA_SAVE_DIR = hover.config["io"]["data_save_dir"] +DATA_SAVE_DIR = hover.config[Section.IO][Key.DATA_SAVE_DIR] diff --git a/hover/recipes/local_config.py b/hover/recipes/local_config.py new file mode 100644 index 00000000..7e3b5eb7 --- /dev/null +++ b/hover/recipes/local_config.py @@ -0,0 +1,9 @@ +import hover +from hover.config_constants import ( + ConfigSection as Section, + ConfigKey as Key, +) + +DEFAULT_REDUCTION_METHOD = hover.config[Section.DATA_EMBEDDING][ + Key.DEFAULT_REDUCTION_METHOD +] diff --git a/hover/recipes/subroutine.py b/hover/recipes/subroutine.py index 1f88b555..30213503 100644 --- a/hover/recipes/subroutine.py +++ b/hover/recipes/subroutine.py @@ -7,11 +7,12 @@ """ import re import numpy as np -import hover import hover.core.explorer as hovex +from hover.module_config import DataFrame as DF from bokeh.layouts import row, column from bokeh.models import Button from rich.console import Console +from .local_config import DEFAULT_REDUCTION_METHOD EXPLORER_CATALOG = { @@ -112,7 +113,7 @@ def standard_annotator(dataset, **kwargs): annotator.activate_search() annotator.plot() - # subscribe for df updates + # subscribe for dataset updates dataset.subscribe_update_push(annotator, {_k: _k for _k in subsets}) # annotators can commit to a dataset @@ -152,7 +153,7 @@ def standard_finder(dataset, **kwargs): finder.activate_search() finder.plot() - # subscribe for df updates + # subscribe for dataset updates dataset.subscribe_update_push(finder, {_k: _k for _k in subsets}) return finder @@ -269,7 +270,7 @@ def retrain_vecnet(): vecnet.auto_adjust_setup(dataset.classes) train_loader = vecnet.prepare_loader(dataset, "train", smoothing_coeff=0.2) - if dataset.dfs["dev"].shape[0] > 0: + if dataset.subset("dev").shape[0] > 0: dev_loader = vecnet.prepare_loader(dataset, "dev") else: dataset._warn("dev set is empty, borrowing train set for validation.") @@ -287,33 +288,33 @@ def update_softlabel_plot(): use_subsets = 
("raw", "train", "dev") inps = [] for _key in use_subsets: - inps.extend(dataset.dfs[_key][feature_key].tolist()) + inps.extend(DF.series_tolist(dataset.subset(_key)[feature_key])) probs = vecnet.predict_proba(inps) labels = [dataset.label_decoder[_val] for _val in probs.argmax(axis=-1)] scores = probs.max(axis=-1).tolist() traj_arr, _, _ = vecnet.manifold_trajectory( inps, - method=hover.config["data.embedding"]["default_reduction_method"], + method=DEFAULT_REDUCTION_METHOD, reducer_kwargs=dict(dimension=manifold_dim), spline_kwargs=dict(points_per_step=5), ) offset = 0 for _key in use_subsets: - _length = dataset.dfs[_key].shape[0] + _length = dataset.subset(_key).shape[0] # skip subset if empty if _length == 0: continue _slice = slice(offset, offset + _length) - dataset.dfs[_key]["pred_label"] = labels[_slice] - dataset.dfs[_key]["pred_score"] = scores[_slice] + dataset.subset(_key).set_column_by_array("pred_label", labels[_slice]) + dataset.subset(_key).set_column_by_array("pred_score", scores[_slice]) for i, _col in enumerate(manifold_traj_cols): # all steps, selected slice _traj = traj_arr[:, _slice, i] # selected slice, all steps _traj = list(np.swapaxes(_traj, 0, 1)) - dataset.dfs[_key][f"{_col}_traj"] = _traj + dataset.subset(_key).set_column_by_array(f"{_col}_traj", _traj) offset += _length diff --git a/hover/utils/bokeh_helper/__init__.py b/hover/utils/bokeh_helper/__init__.py index b2c2b497..c688888f 100644 --- a/hover/utils/bokeh_helper/__init__.py +++ b/hover/utils/bokeh_helper/__init__.py @@ -2,14 +2,13 @@ ???+ note "Useful subroutines for working with bokeh in general." """ import os -import hover import numpy as np from functools import wraps from traceback import format_exc from urllib.parse import urljoin, urlparse from bokeh.models import PreText from bokeh.layouts import column -from hover import module_config +from hover.module_config import ABSTAIN_DECODED, ABSTAIN_HEXCOLOR from .local_config import ( TOOLTIP_TEXT_TEMPLATE, TOOLTIP_IMAGE_TEMPLATE, @@ -18,6 +17,8 @@ TOOLTIP_LABEL_TEMPLATE, TOOLTIP_COORDS_DIV, TOOLTIP_INDEX_DIV, + BOKEH_PALETTE_USAGE, + BOKEH_PALETTE, ) @@ -26,11 +27,10 @@ def auto_label_color(labels): ???+ note "Create a label->hex color mapping dict." """ use_labels = set(labels) - use_labels.discard(module_config.ABSTAIN_DECODED) + use_labels.discard(ABSTAIN_DECODED) use_labels = sorted(use_labels, reverse=False) - palette = hover.config["visual"]["bokeh_palette"] - usage = hover.config["visual"]["bokeh_palette_usage"] + palette, usage = BOKEH_PALETTE, BOKEH_PALETTE_USAGE nlabels, ncolors = len(use_labels), len(palette) assert nlabels <= ncolors, f"Too many labels to support (max at {len(palette)})" @@ -50,7 +50,7 @@ def auto_label_color(labels): use_palette = [palette[i] for i in use_palette_idx] color_dict = { - module_config.ABSTAIN_DECODED: module_config.ABSTAIN_HEXCOLOR, + ABSTAIN_DECODED: ABSTAIN_HEXCOLOR, **{_l: _c for _l, _c in zip(use_labels, use_palette)}, } return color_dict diff --git a/hover/utils/bokeh_helper/local_config.py b/hover/utils/bokeh_helper/local_config.py index 78fb5991..3d622034 100644 --- a/hover/utils/bokeh_helper/local_config.py +++ b/hover/utils/bokeh_helper/local_config.py @@ -1,3 +1,13 @@ +import hover +from hover.config_constants import ( + ConfigSection as Section, + ConfigKey as Key, +) + + +BOKEH_PALETTE = hover.config[Section.VISUAL][Key.BOKEH_PALETTE] +BOKEH_PALETTE_USAGE = hover.config[Section.VISUAL][Key.BOKEH_PALETTE_USAGE] + TOOLTIP_TEXT_TEMPLATE = """
diff --git a/hover/utils/dataframe.py b/hover/utils/dataframe.py new file mode 100644 index 00000000..0c77af07 --- /dev/null +++ b/hover/utils/dataframe.py @@ -0,0 +1,590 @@ +""" +Dataframe-specific operations. +This module is intended to capture pandas/polars logic. +""" + +import numpy as np +import pandas as pd +import polars as pl +import warnings +from abc import ABC, abstractmethod +from collections import Counter, OrderedDict +from functools import wraps + + +TYPE_TO_POLARS = { + int: pl.Int64, + float: pl.Float64, + str: pl.Utf8, + bool: pl.Boolean, + np.int64: pl.Int64, + np.float64: pl.Float64, + pd.Int64Dtype: pl.Int64, + pd.Float64Dtype: pl.Float64, + pd.StringDtype: pl.Utf8, + pd.BooleanDtype: pl.Boolean, +} + + +def sametype(func): + @wraps(func) + def wrapper(obj, *args, **kwargs): + value = func(obj, *args, **kwargs) + if not isinstance(value, obj.__class__): + value = obj.__class__(value) + return value + + return wrapper + + +def convert_indices_to_list(indices, size): + if isinstance(indices, list): + return indices + elif isinstance(indices, np.ndarray): + return indices.astype(int).tolist() + elif isinstance(indices, slice): + assert isinstance( + size, int + ), f"size must be provided for slice indices, got {size}." + return list(range(*indices.indices(size))) + else: + try: + return list(indices) + except Exception: + raise NotImplementedError(f"Indices type {type(indices)} is not supported.") + + +def array_length_check(array, target_length): + if hasattr(array, "__len__"): + assert ( + len(array) == target_length + ), f"length mismatch: {len(array)} != {target_length}" + if hasattr(array, "shape"): + assert ( + array.shape[0] == target_length + ), f"length mismatch: {array.shape[0]} != {target_length}" + + +class AbstractDataframe(ABC): + """ + ???+ note "An abstract class for hover-specific dataframe operations." 
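+
+        A minimal usage sketch (illustrative only, shown here with the concrete
+        `PandasDataframe` subclass; `PolarsDataframe` exposes the same interface):
+
+            import operator
+            from hover.utils.dataframe import PandasDataframe
+
+            df = PandasDataframe.construct({"text": ["abc", "de"], "label": ["A", "B"]})
+            df.shape                                    # (2, 2)
+            df()                                        # the wrapped pd.DataFrame
+            df.get_row_as_dict(0)                       # {"text": "abc", "label": "A"}
+            df.set_column_by_constant("label", "C", indices=[1])
+            df.column_apply("text", len, output="list")         # [3, 2]
+            df.column_apply("text", len, as_column="length")    # writes a new column instead of returning
+            df.filter_rows_by_operator("label", operator.eq, "C")  # wrapped one-row frame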
+ """ + + DF_TYPE = pd.DataFrame + + def __init__(self, df): + assert isinstance(df, self.DF_TYPE), f"df must be of type {self.DF_TYPE}" + self._df = df + + def __call__(self): + return self._df + + @classmethod + def construct(cls, *args, **kwargs): + df = cls.DF_TYPE(*args, **kwargs) + return cls(df) + + def __getitem__(self, key): + if isinstance(key, int) or isinstance(key, str): + return self._df[key] + elif hasattr(key, "__iter__"): + if isinstance(key[0], str): + return self.select_columns(key) + elif isinstance(key[0], int): + return self.select_rows(key) + else: + raise NotImplementedError(f"key {key} is not supported.") + elif isinstance(key, slice): + return self.select_rows(range(*key.indices(self.shape[0]))) + else: + raise NotImplementedError(f"key {key} is not supported.") + + @property + def columns(self): + return self._df.columns + + @columns.setter + def columns(self): + raise NotImplementedError("setting columns is forbidden.") + + @property + def shape(self): + return self._df.shape + + def column_counter(self, column): + return Counter(self.__class__.series_tolist(self._df[column])) + + def select_columns(self, columns): + return self.__class__(self._df[columns]) + + @classmethod + def empty_with_columns(cls, column_to_default): + raise NotImplementedError + + @classmethod + def concat_rows(cls, df_list): + raise NotImplementedError + + @classmethod + def series_values(cls, series): + raise NotImplementedError + + @classmethod + def series_tolist(cls, series): + raise NotImplementedError + + @classmethod + def series_to(cls, series, output): + raise NotImplementedError + + @abstractmethod + def copy(self): + raise NotImplementedError + + @abstractmethod + def to_pandas(self): + raise NotImplementedError + + @abstractmethod + def to_dict_of_lists(self): + raise NotImplementedError + + @abstractmethod + def to_list_of_dicts(self): + raise NotImplementedError + + @abstractmethod + def get_row_as_dict(self, index): + raise NotImplementedError + + @abstractmethod + def select_rows(self, indices): + raise NotImplementedError + + @abstractmethod + def filter_rows_by_operator(self, column, operator, value): + raise NotImplementedError + + @abstractmethod + def unique(self, subset, keep): + raise NotImplementedError + + @abstractmethod + def set_column_by_constant(self, column, value, indices=None): + raise NotImplementedError + + @abstractmethod + def set_column_by_array(self, column, values, indices=None): + raise NotImplementedError + + @abstractmethod + def column_map(self, column, mapping, indices=None, as_column=None, output="numpy"): + raise NotImplementedError + + @abstractmethod + def column_isin(self, column, lookup, indices=None, as_column=None, output="numpy"): + raise NotImplementedError + + @abstractmethod + def column_apply( + self, column, function, indices=None, as_column=None, output="numpy" + ): + raise NotImplementedError + + @abstractmethod + def row_apply(self, function, indices=None, as_column=None, output="numpy"): + raise NotImplementedError + + @abstractmethod + def get_cell_by_row_column(self, row_idx, column_name): + raise NotImplementedError + + @abstractmethod + def set_cell_by_row_column(self, row_idx, column_name, value): + raise NotImplementedError + + +class PandasDataframe(AbstractDataframe): + """ + ???+ note "A class for hover-specific pandas dataframe operations." 
+ """ + + DF_TYPE = pd.DataFrame + + @classmethod + def empty_with_columns(cls, column_to_type): + return cls(pd.DataFrame(columns=column_to_type.keys())) + + @classmethod + def concat_rows(cls, df_list): + for _df in df_list: + assert isinstance(_df, cls), f"df must be of type {cls}" + pd_list = [df() for df in df_list] + return cls(pd.concat(pd_list, axis=0, sort=False, ignore_index=True)) + + @classmethod + def series_values(cls, series): + return series.values + + @classmethod + def series_tolist(cls, series): + return series.tolist() + + @classmethod + def series_to(cls, series, output): + if output == "numpy": + return series.values + elif output == "list": + return series.tolist() + elif output == "series": + return series + else: + raise ValueError( + f"output must be 'numpy', 'list', or 'series', got {output}" + ) + + @sametype + def copy(self): + return self._df.copy() + + def to_pandas(self): + return self._df.copy() + + def to_dict_of_lists(self): + return self._df.to_dict(orient="list") + + def to_list_of_dicts(self): + return self._df.to_dict(orient="records") + + def get_row_as_dict(self, index): + assert isinstance(index, int), f"index must be int, not {type(index)}" + return self._df.iloc[index].to_dict() + + @sametype + def select_rows(self, indices): + if indices is None: + return self + indices = convert_indices_to_list(indices, self.shape[0]) + if len(indices) == 0: + return pd.DataFrame(columns=self.columns) + return self._df.iloc[indices] + + @sametype + def filter_rows_by_operator(self, column, operator, value): + mask = operator(self._df[column], value) + return self._df[mask].reset_index(drop=True) + + @sametype + def unique(self, subset, keep): + return self._df.drop_duplicates(subset, keep=keep).reset_index(drop=True) + + def set_column_by_constant(self, column, value, indices=None): + assert np.isscalar(value), f"value must be scalar, not {type(value)}" + + if indices is None: + self._df[column] = value + else: + # use conversion to avoid pandas loc taking inclusive slice + indices = convert_indices_to_list(indices, self.shape[0]) + self._df.loc[indices, column] = value + + def set_column_by_array(self, column, values, indices=None): + assert not np.isscalar(values), f"values must be array-like, not {type(values)}" + if indices is None: + target_length = self._df.shape[0] + else: + # use conversion to avoid pandas loc taking inclusive slice + indices = convert_indices_to_list(indices, self.shape[0]) + target_length = len(indices) + + array_length_check(values, target_length) + + if indices is None: + self._df[column] = values + else: + # use conversion to avoid pandas loc taking inclusive slice + indices = convert_indices_to_list(indices, self.shape[0]) + self._df.loc[indices, column] = values + + def _pre_apply(self, indices, as_column): + if indices is None: + if as_column is not None: + assert isinstance( + as_column, str + ), f"as_column must be str, got {type(as_column)}" + return self._df, as_column + else: + assert ( + as_column is None + ), f"as_column must be None when indices are specifed, got {as_column}" + # unlike loc, iloc needs no conversion + return self._df.iloc[indices], None + + def _post_apply(self, series, as_column, output): + if as_column is None: + return self.__class__.series_to(series, output) + else: + self._df[as_column] = series + return + + def column_map(self, column, mapping, indices=None, as_column=None, output="numpy"): + subject, as_column = self._pre_apply(indices, as_column) + series = subject[column].map(mapping) + return 
self._post_apply(series, as_column, output) + + def column_isin(self, column, lookup, indices=None, as_column=None, output="numpy"): + subject, as_column = self._pre_apply(indices, as_column) + series = subject[column].isin(lookup) + return self._post_apply(series, as_column, output) + + def column_apply( + self, column, function, indices=None, as_column=None, output="numpy" + ): + subject, as_column = self._pre_apply(indices, as_column) + series = subject[column].apply(function) + return self._post_apply(series, as_column, output) + + def row_apply(self, function, indices=None, as_column=None, output="numpy"): + subject, as_column = self._pre_apply(indices, as_column) + series = subject.apply(function, axis=1) + return self._post_apply(series, as_column, output) + + def get_cell_by_row_column(self, row_idx, column_name): + return self._df.at[row_idx, column_name] + + def set_cell_by_row_column(self, row_idx, column_name, value): + self._df.at[row_idx, column_name] = value + + +class PolarsDataframe(AbstractDataframe): + """ + ???+ note "A class for hover-specific polars dataframe operations." + """ + + DF_TYPE = pl.DataFrame + + @classmethod + def empty_with_columns(cls, column_to_type): + return cls( + pl.DataFrame( + schema={ + col: TYPE_TO_POLARS[_type] for col, _type in column_to_type.items() + }, + ) + ) + + @classmethod + def concat_rows(cls, df_list): + schema = None + pl_list = [] + + # basic type, length, and schema checks; get the union'ed schema + for _df in df_list: + assert isinstance(_df, cls), f"df must be of type {cls}" + _pl = _df() + if _pl.shape[0] == 0: + continue + if schema is None: + schema = OrderedDict(_pl.schema) + else: + for col, dtype in _pl.schema.items(): + assert ( + schema.get(col, dtype) == dtype + ), f"all dataframes must have consistent schema, got {schema} and {_pl.schema}" + schema.update(_pl.schema) + pl_list.append(_pl) + + assert schema is not None, "all dataframes were empty" + return cls(pl.concat(pl_list, how="diagonal")) + + @classmethod + def series_values(cls, series): + return series.to_numpy() + + @classmethod + def series_tolist(cls, series): + return series.to_list() + + @classmethod + def series_to(cls, series, output): + if output == "numpy": + return series.to_numpy() + elif output == "list": + return series.to_list() + elif output == "series": + return series + else: + raise ValueError( + f"output must be 'numpy', 'list', or 'series', got {output}" + ) + + @sametype + def copy(self): + return self._df.clone() + + def to_pandas(self): + return self._df.to_pandas() + + def to_dict_of_lists(self): + return self._df.to_dict(as_series=False) + + def to_list_of_dicts(self): + return self._df.to_dicts() + + def get_row_as_dict(self, index): + assert isinstance(index, int), f"index must be int, not {type(index)}" + return self._df.row(index, named=True) + + @sametype + def select_rows(self, indices): + indices = convert_indices_to_list(indices, size=self._df.shape[0]) + if len(indices) == 0: + return self._df.head(0) + return self._df[indices] + + @sametype + def filter_rows_by_operator(self, column, operator, value): + mask = self.__class__.series_values(operator(self._df[column], value)) + indices = np.where(mask)[0] + return self.select_rows(indices) + + @sametype + def unique(self, subset, keep): + return self._df.unique(subset, keep=keep, maintain_order=True) + + def set_column_by_constant(self, column, value, indices=None): + if indices is None: + self._df = self._df.with_columns(pl.lit(value).alias(column)) + else: + # handle slice / 
array and convert to lookup + indices = set(convert_indices_to_list(indices, size=self._df.shape[0])) + + # create a temporary index column for predicating + tmp_index_col = "index" + while tmp_index_col in self._df.columns: + tmp_index_col += "_" + tmp_df = self._df.with_columns( + pl.arange(0, self._df.shape[0]).alias(tmp_index_col) + ) + + self._df = tmp_df.with_columns( + pl.when(pl.col(tmp_index_col).is_in(indices)) + .then(pl.lit(value)) + .otherwise(pl.col(column)) + .alias(column) + ).drop(tmp_index_col) + + def set_column_by_array(self, column, values, indices=None): + if indices is None: + self._df = self._df.with_columns(pl.Series(values).alias(column)) + else: + indices = convert_indices_to_list(indices, size=self._df.shape[0]) + array_length_check(values, len(indices)) + lookup = dict(zip(indices, values)) + patch = pl.DataFrame( + { + column: [lookup.get(i, None) for i in range(self._df.shape[0])], + } + ) + self._df = self._df.update(patch) + + def _pre_apply(self, indices, as_column): + # determine the column name for the result + if as_column is None: + col_name = "result" + while col_name in self._df.columns: + col_name += "_" + else: + assert isinstance( + as_column, str + ), f"as_column must be str, got {type(as_column)}" + assert ( + indices is None + ), f"as_column must be None when indices are specifed, got {as_column}" + col_name = as_column + + # determine the subject of the apply + subject = self._df if indices is None else self.select_rows(indices)() + + return subject, col_name + + def _post_apply(self, series, as_column, output): + if as_column is None: + return self.__class__.series_to(series, output) + else: + self._df = self._df.with_columns(series.alias(as_column)) + return + + def _get_return_type(self, value): + original_type = type(value) + if original_type not in TYPE_TO_POLARS: + raise TypeError(f"Unsupported return type: {original_type} for {value}") + return TYPE_TO_POLARS[original_type] + + def column_map(self, column, mapping, indices=None, as_column=None, output="numpy"): + subject, _ = self._pre_apply(indices, as_column) + example_value = list(mapping.values())[0] + dtype = self._get_return_type(example_value) + if self.shape[0] > 0: + series = subject[column].map_dict(mapping, return_dtype=dtype) + else: + series = pl.Series([], dtype=dtype) + return self._post_apply(series, as_column, output) + + def column_isin(self, column, lookup, indices=None, as_column=None, output="numpy"): + subject, _ = self._pre_apply(indices, as_column) + series = subject[column].is_in(lookup) + return self._post_apply(series, as_column, output) + + def column_apply( + self, column, function, indices=None, as_column=None, output="numpy" + ): + subject, _ = self._pre_apply(indices, as_column) + if self.shape[0] > 0: + example_value = function(self.get_cell_by_row_column(0, column)) + dtype = self._get_return_type(example_value) + series = subject[column].apply(function, return_dtype=dtype) + else: + series = pl.Series([]) + return self._post_apply(series, as_column, output) + + def row_apply(self, function, indices=None, as_column=None, output="numpy"): + # determine the return type for df.apply + if self.shape[0] > 0: + example_value = function(self._df.row(0, named=True)) + dtype = self._get_return_type(example_value) + else: + dtype = None + + subject, col = self._pre_apply(indices, as_column) + + # handle empty subject + if subject.shape[0] == 0: + if as_column is None: + return self.__class__.series_to(pl.Series([]), output) + else: + self._df = 
self._df.with_columns(pl.Series([]).alias(as_column)) + return + + # create the function to be applied + to_apply = ( + pl.struct(self._df.columns).apply(function, return_dtype=dtype).alias(col) + ) + # apply the function + if as_column is None: + series = subject.with_columns(to_apply)[col] + return self.__class__.series_to(series, output) + else: + assert subject is self._df, "subject must be self._df" + self._df = subject.with_columns(to_apply) + return + + def get_cell_by_row_column(self, row_idx, column_name): + return self._df.row(row_idx, named=True)[column_name] + + def set_cell_by_row_column(self, row_idx, column_name, value): + if isinstance(value, (list, tuple, np.ndarray, pd.Series)): + warnings.warn( + "Setting a single cell with a list-like object may not yet be supported by polars." + ) + self._df[row_idx, column_name] = value diff --git a/hover/utils/typecheck.py b/hover/utils/typecheck.py new file mode 100644 index 00000000..ee18294c --- /dev/null +++ b/hover/utils/typecheck.py @@ -0,0 +1,21 @@ +class TypedValueDict(dict): + """ + A dict that only allows values of a certain type. + """ + + def __init__(self, type_, *args, **kwargs): + self._type = type_ + super().__init__(*args, **kwargs) + + def __setitem__(self, key, value): + self.typecheck(value) + super().__setitem__(key, value) + + def typecheck(self, value): + if not isinstance(value, self._type): + raise TypeError(f"Value must be of type {self._type}, got {type(value)}") + + def update(self, other): + for _value in other.values(): + self.typecheck(_value) + super().update(other) diff --git a/setup.py b/setup.py index 9d643c0d..dced219e 100644 --- a/setup.py +++ b/setup.py @@ -31,6 +31,8 @@ def get_description(): "torch>=1.10.0", # data handling "pandas>=1.3.0", + "polars>=0.17.0", + "pyarrow>=11.0.0", "numpy>=1.22", # computations "scipy>=1.3.2", @@ -41,7 +43,7 @@ def get_description(): # dimensionality reduction: UMAP is included "umap-learn>=0.3.10", # module config customization - "flexmod>=0.1.0", + "flexmod>=0.1.2", # optional: more dimensionality reduction methods # "ivis[cpu]>=1.7", # optional: distant supervision diff --git a/tests/conftest.py b/tests/conftest.py index 59bb56db..154edd8a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ import os import numpy as np import pandas as pd +import polars as pl from functools import lru_cache # configure hover @@ -90,12 +91,12 @@ def dummy_labeling_function_list(): @labeling_function(targets=["rec.autos"]) def auto_keywords(row): - flag = re.search(r"(wheel|diesel|gasoline|automobile|vehicle)", row.text) + flag = re.search(r"(wheel|diesel|gasoline|automobile|vehicle)", row["text"]) return "rec.autos" if flag else ABSTAIN_DECODED @labeling_function(targets=["rec.sport.baseball"]) def baseball_keywords(row): - flag = re.search(r"(baseball|stadium|\ bat\ |\ base\ )", row.text) + flag = re.search(r"(baseball|stadium|\ bat\ |\ base\ )", row["text"]) return "rec.sport.baseball" if flag else ABSTAIN_DECODED lf_list = [auto_keywords, baseball_keywords] @@ -104,7 +105,7 @@ def baseball_keywords(row): @pytest.fixture(scope="module") -def generate_df_with_coords(): +def generate_pandas_df_with_coords(): import faker from hover.core.local_config import embedding_field @@ -129,45 +130,63 @@ def random_df_with_coords(size=300, embedding_dim=3): return random_df_with_coords +def wrap_pandas_df(pandas_df): + from hover.utils.dataframe import PandasDataframe, PolarsDataframe + from hover.module_config import DataFrame + + assert isinstance(pandas_df, 
pd.DataFrame), f"Unexpected type {type(pandas_df)}" + if DataFrame is PandasDataframe: + return DataFrame(pandas_df) + elif DataFrame is PolarsDataframe: + return DataFrame(pl.from_pandas(pandas_df)) + else: + raise ValueError(f"Unexpected DataFrame type {DataFrame}") + + @pytest.fixture(scope="module") -def example_raw_df(generate_df_with_coords): +def example_raw_pandas_df(generate_pandas_df_with_coords): from hover.module_config import ABSTAIN_DECODED - df = generate_df_with_coords(300) + df = generate_pandas_df_with_coords(300) df["label"] = ABSTAIN_DECODED return df @pytest.fixture(scope="module") -def example_soft_label_df(example_raw_df): - df = example_raw_df.copy() +def example_raw_df(example_raw_pandas_df): + return wrap_pandas_df(example_raw_pandas_df) + + +@pytest.fixture(scope="module") +def example_soft_label_df(example_raw_pandas_df): + df = example_raw_pandas_df.copy() df["pred_label"] = df.apply(RANDOM_LABEL, axis=1) df["pred_score"] = df.apply(RANDOM_SCORE, axis=1) - return df + return wrap_pandas_df(df) @pytest.fixture(scope="module") -def example_margin_df(example_raw_df): - df = example_raw_df.copy() +def example_margin_df(example_raw_pandas_df): + df = example_raw_pandas_df.copy() df["label_1"] = df.apply(RANDOM_LABEL, axis=1) df["label_2"] = df.apply(RANDOM_LABEL, axis=1) - return df + return wrap_pandas_df(df) @pytest.fixture(scope="module") -def example_labeled_df(generate_df_with_coords): - df = generate_df_with_coords(100) +def example_labeled_df(generate_pandas_df_with_coords): + df = generate_pandas_df_with_coords(100) df["label"] = df.apply(RANDOM_LABEL, axis=1) - return df + return wrap_pandas_df(df) @pytest.fixture(scope="module") -def example_everything_df(example_raw_df, generate_df_with_coords): +def example_everything_pandas_df(example_raw_pandas_df, generate_pandas_df_with_coords): from hover.core.local_config import DATASET_SUBSET_FIELD - raw_df = example_raw_df.copy() + raw_df = example_raw_pandas_df.copy() raw_df[DATASET_SUBSET_FIELD] = "raw" - labeled_df = generate_df_with_coords(200) + labeled_df = generate_pandas_df_with_coords(200) labeled_df["label"] = labeled_df.apply(RANDOM_LABEL, axis=1) labeled_df[DATASET_SUBSET_FIELD] = "train" labeled_df.loc[100:150, DATASET_SUBSET_FIELD] = "dev" @@ -186,40 +205,47 @@ def example_everything_df(example_raw_df, generate_df_with_coords): return df -def subroutine_dataset_with_vectorizer(df, dataset_cls, vectorizer): - dataset = dataset_cls.from_pandas(df) +@pytest.fixture(scope="module") +def example_everything_df(example_everything_pandas_df): + return wrap_pandas_df(example_everything_pandas_df) + + +def subroutine_dataset_with_vectorizer(pandas_df, dataset_cls, vectorizer): + dataset = dataset_cls.from_pandas(pandas_df) dataset.vectorizer_lookup[2] = vectorizer return dataset @pytest.fixture(scope="module") -def example_text_dataset(example_everything_df, dummy_vectorizer): +def example_text_dataset(example_everything_pandas_df, dummy_vectorizer): from hover.core.dataset import SupervisableTextDataset - return subroutine_dataset_with_vectorizer( - example_everything_df, + dataset = subroutine_dataset_with_vectorizer( + example_everything_pandas_df, SupervisableTextDataset, dummy_vectorizer, ) + return dataset + @pytest.fixture(scope="module") -def example_image_dataset(example_everything_df, dummy_vectorizer): +def example_image_dataset(example_everything_pandas_df, dummy_vectorizer): from hover.core.dataset import SupervisableImageDataset return subroutine_dataset_with_vectorizer( - example_everything_df, 
+ example_everything_pandas_df, SupervisableImageDataset, dummy_vectorizer, ) @pytest.fixture(scope="module") -def example_audio_dataset(example_everything_df, dummy_vectorizer): +def example_audio_dataset(example_everything_pandas_df, dummy_vectorizer): from hover.core.dataset import SupervisableAudioDataset return subroutine_dataset_with_vectorizer( - example_everything_df, + example_everything_pandas_df, SupervisableAudioDataset, dummy_vectorizer, ) diff --git a/tests/core/explorer/local_helper.py b/tests/core/explorer/local_helper.py index bc2b32a8..bc816566 100644 --- a/tests/core/explorer/local_helper.py +++ b/tests/core/explorer/local_helper.py @@ -1,4 +1,4 @@ -from hover import module_config +from hover.module_config import DataFrame, ABSTAIN_DECODED from hover.core.explorer.local_config import SEARCH_SCORE_FIELD from hover.utils.snorkel_helper import labeling_function from bokeh.events import SelectionGeometry @@ -112,7 +112,7 @@ def subroutine_rules_from_text_df(df): """ Dummy rules for predictable outcome. """ - texts = df["text"].tolist() + texts = DataFrame.series_tolist(df["text"]) assert len(texts) >= 20, f"Expected at least 20 texts, got {len(texts)}" first_six_texts = set(texts[:6]) first_ten_texts = set(texts[:10]) @@ -120,7 +120,7 @@ def subroutine_rules_from_text_df(df): def subroutine_lookup(query, pool, label): if query in pool: return label - return module_config.ABSTAIN_DECODED + return ABSTAIN_DECODED @labeling_function(targets=["A"], name="narrow_rule_a") def narrow_rule_a_clone(row): diff --git a/tests/core/explorer/test_functionality.py b/tests/core/explorer/test_functionality.py index 11dcba82..47ba5cc9 100644 --- a/tests/core/explorer/test_functionality.py +++ b/tests/core/explorer/test_functionality.py @@ -3,7 +3,7 @@ For mechanisms that are invariant across `hover.core.explorer.feature`. 
""" -from hover import module_config +from hover.module_config import DataFrame, ABSTAIN_DECODED from hover.recipes.subroutine import get_explorer_class from bokeh.events import ButtonClick, MenuItemClick from .local_helper import ( @@ -61,10 +61,10 @@ def subroutine(df_dict, feature_type): assert annotator.sources["raw"].data == prev_source_data df_dict = { - "raw": example_raw_df.copy(), - "train": example_labeled_df.copy(), - "dev": example_labeled_df.copy(), - "test": example_labeled_df.copy(), + "raw": example_raw_df, + "train": example_labeled_df, + "dev": example_labeled_df, + "test": example_labeled_df, } # test four subset combinations of df_dicts @@ -98,7 +98,7 @@ def test_filter_text(example_raw_df): explorer.plot() # dynamically construct patterns with predictable outcome - texts = explorer.dfs["raw"]["text"].tolist() + texts = DataFrame.series_tolist(explorer.dfs["raw"]["text"]) first_token_of_ten = set() first_token_of_two = set() for i, _text in enumerate(texts): @@ -181,13 +181,10 @@ def test_labeling(example_raw_df): assert explorer.sources["raw"].selected.indices == _expected_selected # actual labeling - assert ( - explorer.dfs["raw"].loc[[0, 1], "label"] - == [module_config.ABSTAIN_DECODED] * 2 - ).all() + assert (explorer.dfs["raw"]["label"][[0, 1]] == [ABSTAIN_DECODED] * 2).all() explorer.annotator_input.value = "A" explorer.annotator_apply._trigger_event(apply_event) - assert (explorer.dfs["raw"].loc[[0, 1], "label"] == ["A"] * 2).all() + assert (explorer.dfs["raw"]["label"][[0, 1]] == ["A"] * 2).all() @pytest.mark.core @@ -340,7 +337,7 @@ def test_lf_labeling(example_raw_df, example_labeled_df): explorer.lf_filter_trigger._trigger_event(filter_event) explorer.lf_apply_trigger._trigger_event(apply_event) - first_six_labels = explorer.dfs["raw"]["label"].iloc[:6].tolist() + first_six_labels = DataFrame.series_tolist(explorer.dfs["raw"]["label"])[:6] assert first_six_labels == ["A"] * 6 # add more rules, check menu again @@ -368,7 +365,7 @@ def test_lf_labeling(example_raw_df, example_labeled_df): _event = MenuItemClick(explorer.lf_apply_trigger, item="narrow_rule_b") explorer.lf_apply_trigger._trigger_event(_event) - first_six_labels = explorer.dfs["raw"]["label"].iloc[:6].tolist() + first_six_labels = DataFrame.series_tolist(explorer.dfs["raw"]["label"])[:6] assert first_six_labels == ["B"] * 6 # use two pops to check against misremoval of renderers diff --git a/tests/core/representation/test_reduction.py b/tests/core/representation/test_reduction.py index 7e5259ff..3180c664 100644 --- a/tests/core/representation/test_reduction.py +++ b/tests/core/representation/test_reduction.py @@ -28,7 +28,7 @@ def test_dimensionality_reduction(n_points=1000): reducer.fit_transform( "umap", n_neighbors=3, min_dist=0.01, dimension=3, metric="euclidean" ) - embedding = reducer.transform(arr) + embedding = reducer.transform(arr, "umap") assert embedding.shape == (n_points, 3) embedding = reducer.transform(np.array([])) assert embedding.shape == (0,) diff --git a/tests/core/test_dataset.py b/tests/core/test_dataset.py index e98d9df4..5af9ee62 100644 --- a/tests/core/test_dataset.py +++ b/tests/core/test_dataset.py @@ -1,6 +1,7 @@ import pytest import os from hover.core.dataset import SupervisableTextDataset +from hover.module_config import DataFrame from bokeh.events import MenuItemClick @@ -37,6 +38,9 @@ def test_init(self): feature_key="content", label_key="mark", ) + for _df in dataset.dfs.values(): + assert isinstance(_df, DataFrame), f"Expecting DataFrame, got {type(_df)}" + 
dataset.validate_labels() # check the subset sizes @@ -69,7 +73,7 @@ def test_setup_label_coding(example_text_dataset): @pytest.mark.lite def test_validate_labels(example_text_dataset): dataset = example_text_dataset.copy() - dataset.dfs["train"].at[0, "label"] = "invalid_label" + dataset.dfs["train"].set_cell_by_row_column(0, "label", "invalid_label") try: dataset.validate_labels() @@ -83,7 +87,11 @@ def test_validate_labels(example_text_dataset): @pytest.mark.lite def test_compute_feature_index(example_text_dataset): dataset = example_text_dataset.copy() - dataset.dfs["raw"].at[0, "text"] = dataset.dfs["raw"].at[1, "text"] + dataset.dfs["raw"].set_cell_by_row_column( + 0, + "text", + dataset.dfs["raw"].get_cell_by_row_column(1, "text"), + ) try: dataset.compute_feature_index() @@ -95,7 +103,7 @@ def test_compute_feature_index(example_text_dataset): @pytest.mark.lite def test_locate_by_feature_value(example_text_dataset): dataset = example_text_dataset.copy() - feature_value = dataset.dfs["raw"].at[0, "text"] + feature_value = dataset.dfs["raw"].get_cell_by_row_column(0, "text") subset, index = dataset.locate_by_feature_value(feature_value) assert subset == "raw" and index == 0 @@ -137,7 +145,7 @@ def test_compute_nd_embedding(example_text_dataset, dummy_vectorizer): dataset.compute_nd_embedding(dummy_vectorizer, dimension=3) # empty one of the dfs; should not break the method - dataset.dfs["test"] = dataset.dfs["test"].loc[0:0] + dataset.dfs["test"] = dataset.dfs["test"][0:0] dataset.compute_2d_embedding(dummy_vectorizer) # verify that the vectorizer has been remembered diff --git a/tests/core/test_neural.py b/tests/core/test_neural.py index c05d1ed2..3870a26a 100644 --- a/tests/core/test_neural.py +++ b/tests/core/test_neural.py @@ -2,6 +2,7 @@ import numpy as np from copy import deepcopy from hover.core.neural import VectorNet +from hover.module_config import DataFrame @pytest.fixture @@ -84,7 +85,7 @@ def test_predict_proba(example_vecnet, example_text_dataset): def test_manifold_trajectory(example_vecnet, example_raw_df): for _method in ["umap", "ivis"]: traj_arr, seq_arr, disparities = example_vecnet.manifold_trajectory( - example_raw_df["text"].tolist() + DataFrame.series_tolist(example_raw_df["text"]) ) assert isinstance(traj_arr, np.ndarray) assert isinstance(seq_arr, np.ndarray) diff --git a/tests/module_config/hover_alt_config_1.ini b/tests/module_config/hover_alt_config_1.ini index 158a5715..d7972328 100644 --- a/tests/module_config/hover_alt_config_1.ini +++ b/tests/module_config/hover_alt_config_1.ini @@ -1,13 +1,16 @@ [visual] abstain_hexcolor = #b0b0b0 -#bokeh_palette = ["#b0ffff", "#ffb0ff", "#ffffb0", "#b0b0ff", "#b0ffb0", "#ffb0b0"] +bokeh_palette = ["#b0ffff", "#ffb0ff", "#ffffb0", "#b0b0ff", "#b0ffb0", "#ffb0b0", "#a0eeee", "#eea0ee", "#eeeea0", "#a0a0ee", "#a0eea0", "#eea0a0", "#90dddd", "#dd90dd", "#dddd90", "#9090dd", "#90dd90", "#dd9090", "#80cccc", "#cc80cc", "#cccc80", "#8080cc", "#80cc80", "#cc8080"] + +[backend] +dataframe_library = polars [data.embedding] -#default_reduction_method = ivis +default_reduction_method = ivis [data.values] abstain_decoded = label.abstain -abstain_encoded = -1 +abstain_encoded = -2 [data.columns] encoded_label_key = LABEL_ENCODED diff --git a/tests/recipes/local_helper.py b/tests/recipes/local_helper.py index cd88b6a8..7b036790 100644 --- a/tests/recipes/local_helper.py +++ b/tests/recipes/local_helper.py @@ -1,4 +1,5 @@ import time +import operator from bokeh.document import Document from bokeh.events import ButtonClick, 
MenuItemClick from hover import module_config @@ -28,22 +29,20 @@ def action_patch_selection(dataset): def action_apply_labels(annotator): apply_event = ButtonClick(annotator.annotator_apply) annotator.annotator_apply._trigger_event(apply_event) - labeled_slice = annotator.dfs["raw"][ - annotator.dfs["raw"]["label"] != module_config.ABSTAIN_DECODED - ] + labeled_slice = annotator.dfs["raw"].filter_rows_by_operator( + "label", operator.ne, module_config.ABSTAIN_DECODED + )() return labeled_slice def action_commit_selection(dataset, subset="train"): commit_event = MenuItemClick(dataset.data_committer, item=subset) dataset.data_committer._trigger_event(commit_event) - return dataset def action_deduplicate(dataset): dedup_event = ButtonClick(dataset.dedup_trigger) dataset.dedup_trigger._trigger_event(dedup_event) - return dataset def action_push_data(dataset): diff --git a/tests/recipes/test_experimental.py b/tests/recipes/test_experimental.py index 65ce6c6d..d789698f 100644 --- a/tests/recipes/test_experimental.py +++ b/tests/recipes/test_experimental.py @@ -6,17 +6,21 @@ active_learning, snorkel_crosscheck, ) +from hover.module_config import DataFrame as DF from bokeh.events import ButtonClick, SelectionGeometry from .local_helper import execute_handle_function def test_active_learning(example_text_dataset, dummy_vecnet_callback): + def read_scores(dataset, subset): + return DF.series_values(dataset.dfs[subset]["pred_score"]).copy() + dataset = example_text_dataset.copy() vecnet = dummy_vecnet_callback(dataset) layout, objects = _active_learning(dataset, vecnet) assert layout.visible - initial_scores = dataset.dfs["raw"]["pred_score"].values.copy() + initial_scores = read_scores(dataset, "raw") finder, annotator = objects["finder"], objects["annotator"] softlabel = objects["softlabel"] @@ -26,7 +30,7 @@ def test_active_learning(example_text_dataset, dummy_vecnet_callback): # train for default number of epochs model_trainer._trigger_event(train_event) - first_scores = dataset.dfs["raw"]["pred_score"].values.copy() + first_scores = read_scores(dataset, "raw") assert not np.allclose(first_scores, initial_scores) # emulating user interaction: slide coords to view manifold trajectory @@ -35,7 +39,7 @@ def test_active_learning(example_text_dataset, dummy_vecnet_callback): # train for 1 more epoch model_trainer._trigger_event(train_event) - second_scores = dataset.dfs["raw"]["pred_score"].values + second_scores = read_scores(dataset, "raw") assert not np.allclose(second_scores, first_scores) # take 25 and 75 percentiles of scores for later use range_low, range_high = np.percentile(second_scores, [25, 75]).tolist() diff --git a/tests/recipes/test_stable.py b/tests/recipes/test_stable.py index e4676470..bcfd69d3 100644 --- a/tests/recipes/test_stable.py +++ b/tests/recipes/test_stable.py @@ -1,4 +1,5 @@ import pytest +from hover.module_config import DataFrame from hover.recipes.stable import ( _simple_annotator, _linked_annotator, @@ -43,7 +44,9 @@ def subroutine_common_test(dataset): # evict a point from selection evict_idx = 5 # prepare expected values post eviction - expected_texts = dataset.dfs["raw"].loc[raw_view_select, feature].tolist() + expected_texts = DataFrame.series_tolist( + dataset.dfs["raw"].select_rows(raw_view_select)[feature] + ) expected_texts.pop(evict_idx) # make sub-selection dataset.sel_table.source.selected.indices = [evict_idx] @@ -90,16 +93,24 @@ def subroutine_common_test(dataset): assert subset_to_patch == "train" assert idx_to_patch == (raw_view_select + 
train_view_select)[raw_idx_to_patch] # prepare an edit patch - old_label = dataset.dfs[subset_to_patch].at[idx_to_patch, "label"] + old_label = dataset.dfs[subset_to_patch].get_cell_by_row_column( + idx_to_patch, "label" + ) new_label = PSEUDO_LABELS[0] if old_label == new_label: new_label = PSEUDO_LABELS[1] dataset.sel_table.source.data["label"][raw_idx_to_patch] = new_label dataset.sel_table.source.selected.indices = [raw_idx_to_patch] # execute patch - assert dataset.dfs[subset_to_patch].at[idx_to_patch, "label"] != new_label + assert ( + dataset.dfs[subset_to_patch].get_cell_by_row_column(idx_to_patch, "label") + != new_label + ) action_patch_selection(dataset) - assert dataset.dfs[subset_to_patch].at[idx_to_patch, "label"] == new_label + assert ( + dataset.dfs[subset_to_patch].get_cell_by_row_column(idx_to_patch, "label") + == new_label + ) @pytest.mark.lite diff --git a/tests/utils/test_dataframe.py b/tests/utils/test_dataframe.py new file mode 100644 index 00000000..a3e078de --- /dev/null +++ b/tests/utils/test_dataframe.py @@ -0,0 +1,502 @@ +from hover.utils.dataframe import ( + PandasDataframe, + PolarsDataframe, + convert_indices_to_list, + TYPE_TO_POLARS, +) +from pprint import pformat +import numpy as np +import pandas as pd +import polars as pl +import pytest +import operator + + +SERIES_VALUE_TEST_CASES = [ + list(range(30)), + list("abc" * 10), + [True, False] * 15, +] + +DATAFRAME_VALUE_TEST_CASES = [ + {"int": list(range(30)), "bool": [False, True] * 15, "str": list("abc" * 10)}, + { + "int": list(range(30)), + "bool": [True, False] * 15, + "array": [np.array([float(i)]) for i in range(30)], + }, +] + +HASHABLE_COLUMNS = ["int", "bool", "str"] + +ROW_INDICES_TEST_CASES = [ + [0, 1], + [], + np.array([0, 1, 3]), + slice(0, 10, 2), + range(0, 10, 2), + None, +] + + +def numpy_to_native(value): + """ + Convert numpy types to native types. + """ + if isinstance(value, np.ndarray): + value = value.tolist() + elif isinstance(value, np.generic): + value = value.item() + else: + assert isinstance(value, (list, tuple, dict, str, bool, int, float)) + return value + + +@pytest.mark.lite +class TestDataframe: + """ + Consistency tests across pandas, polars, and hover dataframes. + """ + + def _get_dataframes(self, df_data): + """ + Subroutine for creating dataframes in tests. + """ + df_pd = PandasDataframe.construct(df_data) + df_pl = PolarsDataframe.construct(df_data) + pd_df = pd.DataFrame(df_data) + pl_df = pl.DataFrame(df_data) + + return df_pd, df_pl, pd_df, pl_df + + def _assert_equivalent_dataframes(self, df_pd, df_pl, pd_df, pl_df): + """ + Subroutine for checking dataframe values. + """ + assert df_pd.equals(pd_df), f"{pformat(df_pd)}\n{pformat(pd_df)}" + assert df_pl.frame_equal(pl_df), f"{pformat(df_pl)}\n{pformat(pl_df)}" + assert df_pl.to_dicts() == df_pd.to_dict( + orient="records" + ), f"{pformat(df_pl)}\n{pformat(df_pd)}" + + @pytest.mark.parametrize("df_data", DATAFRAME_VALUE_TEST_CASES) + def test_basics(self, df_data): + """ + Constructor, `()`, `construct`, `copy`, `to_pandas`, `columns`, `shape`. 
+ """ + pd_df = pd.DataFrame(df_data) + pl_df = pl.DataFrame(df_data) + df_pd = PandasDataframe(pd_df) + df_pl = PolarsDataframe(pl_df) + + assert df_pd() is pd_df + assert df_pl() is pl_df + + assert df_pd().equals(PandasDataframe.construct(df_data)()) + assert df_pl().frame_equal(PolarsDataframe.construct(df_data)()) + + assert df_pd.copy()() is not df_pd() + assert df_pl.copy()() is not df_pl() + + assert df_pd().equals(df_pl().to_pandas()) + assert df_pd().equals(df_pd.to_pandas()) + assert df_pd().equals(df_pl.to_pandas()) + + assert (df_pd.columns == df_pd().columns).all() + assert df_pl.columns == df_pl().columns + + assert df_pd.shape == df_pd().shape == df_pl.shape == df_pl().shape + + def test_empty_with_columns(self): + column_to_type = {"a": str, "b": int, "c": bool} + + df_pd = PandasDataframe.empty_with_columns(column_to_type) + df_pl = PolarsDataframe.empty_with_columns(column_to_type) + pd_df = pd.DataFrame(columns=column_to_type.keys()) + pl_df = pl.DataFrame( + schema={ + col: TYPE_TO_POLARS[_type] for col, _type in column_to_type.items() + }, + ) + + assert df_pd().equals(pd_df) + assert df_pl().frame_equal(pl_df) + assert df_pd.shape == df_pl.shape == (0, 3) + + @pytest.mark.parametrize("df_data_a", DATAFRAME_VALUE_TEST_CASES) + @pytest.mark.parametrize("df_data_b", DATAFRAME_VALUE_TEST_CASES[::-1]) + def test_concat_rows(self, df_data_a, df_data_b): + df_pd_a = PandasDataframe.construct(df_data_a) + df_pd_b = PandasDataframe.construct(df_data_b) + df_pl_a = PolarsDataframe.construct(df_data_a) + df_pl_b = PolarsDataframe.construct(df_data_b) + pd_df_a = df_pd_a() + pd_df_b = df_pd_b() + pl_df_a = df_pl_a() + pl_df_b = df_pl_b() + df_pd_ab = PandasDataframe.concat_rows([df_pd_a, df_pd_b]) + df_pl_ab = PolarsDataframe.concat_rows([df_pl_a, df_pl_b]) + + pd_df_ab = pd.concat([pd_df_a, pd_df_b], axis=0, ignore_index=True) + # use diagonal for non-overlapping columns + pl_df_ab = pl.concat([pl_df_a, pl_df_b], how="diagonal") + assert df_pd_ab().equals(pd_df_ab) + assert df_pl_ab().frame_equal(pl_df_ab) + assert df_pl_ab().to_pandas().equals(pd_df_ab) + + try: + _ = PandasDataframe.concat_rows([pd_df_a, pd_df_b]) + raise Exception("Should have raised an AssertionError") + except AssertionError: + pass + + try: + _ = PolarsDataframe.concat_rows([pl_df_a, pl_df_b]) + raise Exception("Should have raised an AssertionError") + except AssertionError: + pass + + @pytest.mark.parametrize("values", SERIES_VALUE_TEST_CASES) + def test_series_class_methods(self, values): + np_values = np.array(values) + pd_series = pd.Series(values) + pl_series = pl.Series(values) + values_pd = PandasDataframe.series_values(pd_series) + values_pl = PolarsDataframe.series_values(pl_series) + pd_values = pd_series.values + pl_values = pl_series.to_numpy() + + assert np.equal(pd_values, np_values).all() + assert np.equal(pl_values, np_values).all() + assert np.equal(values_pd, np_values).all() + assert np.equal(values_pl, np_values).all() + + list_pd = PandasDataframe.series_tolist(pd_series) + list_pl = PolarsDataframe.series_tolist(pl_series) + + assert list_pd == list_pl == list(values) + + @pytest.mark.parametrize("df_data", DATAFRAME_VALUE_TEST_CASES) + def test_to_dict_of_lists(self, df_data): + df_pd, df_pl, pd_df, pl_df = self._get_dataframes(df_data) + + df_pd_dict = df_pd.to_dict_of_lists() + df_pl_dict = df_pl.to_dict_of_lists() + pd_df_dict = pd_df.to_dict(orient="list") + pl_df_dict = pl_df.to_dict(as_series=False) + + assert df_pd_dict == df_pl_dict == pd_df_dict == pl_df_dict + + 
@pytest.mark.parametrize("df_data", DATAFRAME_VALUE_TEST_CASES) + def test_to_list_of_dicts(self, df_data): + df_pd, df_pl, pd_df, pl_df = self._get_dataframes(df_data) + + df_pd_dictl = df_pd.to_list_of_dicts() + df_pl_dictl = df_pl.to_list_of_dicts() + pd_df_dictl = pd_df.to_dict(orient="records") + pl_df_dictl = pl_df.to_dicts() + + assert df_pd_dictl == df_pl_dictl == pd_df_dictl == pl_df_dictl + + @pytest.mark.parametrize("df_data", DATAFRAME_VALUE_TEST_CASES) + def test_get_row_as_dict(self, df_data): + df_pd, df_pl, pd_df, pl_df = self._get_dataframes(df_data) + + row_pd = df_pd.get_row_as_dict(0) + row_pl = df_pl.get_row_as_dict(0) + pd_row = pd_df.iloc[0].to_dict() + pl_row = pl_df.row(0, named=True) + + assert row_pd == row_pl == pd_row == pl_row + + try: + _ = df_pd.get_row_as_dict([0, 1]) + raise Exception("Should have raised an AssertionError") + except AssertionError: + pass + + try: + _ = df_pl.get_row_as_dict([0, 1]) + raise Exception("Should have raised an AssertionError") + except AssertionError: + pass + + @pytest.mark.parametrize("df_data", DATAFRAME_VALUE_TEST_CASES) + @pytest.mark.parametrize("indices", ROW_INDICES_TEST_CASES) + def test_select_rows(self, df_data, indices): + if indices is None: + return + + df_pd, df_pl, pd_df, pl_df = self._get_dataframes(df_data) + + df_pd_rows = df_pd.select_rows(indices)() + df_pl_rows = df_pl.select_rows(indices)() + + indices_list = convert_indices_to_list(indices, size=df_pd.shape[0]) + if len(indices_list) == 0: + pd_df_rows = pd.DataFrame(columns=pd_df.columns) + pl_df_rows = pl.DataFrame({}, schema=pl_df.schema) + else: + pd_df_rows = pd_df.iloc[indices_list] + pl_df_rows = pl_df[indices_list] + self._assert_equivalent_dataframes( + df_pd_rows, + df_pl_rows, + pd_df_rows, + pl_df_rows, + ) + + @pytest.mark.parametrize("df_data", DATAFRAME_VALUE_TEST_CASES) + def test_filter_rows_by_operator(self, df_data): + df_pd, df_pl, pd_df, pl_df = self._get_dataframes(df_data) + + for _op in [operator.eq, operator.ne, operator.gt, operator.lt]: + _df_pd_slice = df_pd.filter_rows_by_operator("int", _op, 5)() + _pd_df_slice = pd_df[pd_df["int"].apply(lambda x: _op(x, 5))].reset_index( + drop=True + ) + _df_pl_slice = df_pl.filter_rows_by_operator("int", _op, 5)() + _pl_df_slice = pl_df[np.where(_op(pl_df["int"], 5))[0]] + self._assert_equivalent_dataframes( + _df_pd_slice, + _df_pl_slice, + _pd_df_slice, + _pl_df_slice, + ) + + @pytest.mark.parametrize("df_data", DATAFRAME_VALUE_TEST_CASES) + def test_unique(self, df_data): + df_pd, df_pl, pd_df, pl_df = self._get_dataframes(df_data) + + df_pd_unique = df_pd.unique("bool", keep="last")() + pd_df_unique = pd_df.drop_duplicates("bool", keep="last").reset_index(drop=True) + df_pl_unique = df_pl.unique("bool", keep="last")() + pl_df_unique = pl_df.unique("bool", keep="last", maintain_order=True) + + self._assert_equivalent_dataframes( + df_pd_unique, + df_pl_unique, + pd_df_unique, + pl_df_unique, + ) + + @pytest.mark.parametrize("df_data", DATAFRAME_VALUE_TEST_CASES) + @pytest.mark.parametrize("indices", ROW_INDICES_TEST_CASES) + def test_set_column_by_constant(self, df_data, indices): + df_pd, df_pl, pd_df, pl_df = self._get_dataframes(df_data) + indices_list = ( + list(range(df_pd.shape[0])) + if indices is None + else convert_indices_to_list(indices, size=df_pd.shape[0]) + ) + + col = df_pd.columns[0] + value = df_pd.get_row_as_dict(0)[col] + + df_pd.set_column_by_constant(col, value, indices) + pd_df.loc[indices_list, col] = value + df_pl.set_column_by_constant(col, value, indices) + 
for i in indices_list: + pl_df[i, col] = value + + self._assert_equivalent_dataframes(df_pd(), df_pl(), pd_df, pl_df) + + @pytest.mark.parametrize("df_data", DATAFRAME_VALUE_TEST_CASES) + @pytest.mark.parametrize("indices", ROW_INDICES_TEST_CASES) + def test_set_column_by_array(self, df_data, indices): + df_pd, df_pl, pd_df, pl_df = self._get_dataframes(df_data) + indices_list = ( + list(range(df_pd.shape[0])) + if indices is None + else convert_indices_to_list(indices, size=df_pd.shape[0]) + ) + col = df_pd.columns[0] + values = df_pd.select_rows(indices)[col].values + + df_pd.set_column_by_array(col, values, indices) + pd_df.loc[indices_list, col] = values + df_pl.set_column_by_array(col, values, indices) + lookup = dict(zip(indices_list, values)) + pl_df = pl_df.update( + pl.DataFrame({col: [lookup.get(i, None) for i in range(pl_df.shape[0])]}) + ) + + self._assert_equivalent_dataframes(df_pd(), df_pl(), pd_df, pl_df) + + @pytest.mark.parametrize("df_data", DATAFRAME_VALUE_TEST_CASES) + @pytest.mark.parametrize("indices", ROW_INDICES_TEST_CASES) + def test_column_map(self, df_data, indices): + for col in HASHABLE_COLUMNS: + if col not in df_data.keys(): + continue + df_pd, df_pl, pd_df, pl_df = self._get_dataframes(df_data) + mapping = pd_df[col].value_counts().to_dict() + if col == "str": + mapping = { + _k: "#b0b0b0" for _k in pd_df[col].value_counts().to_dict().keys() + } + indices_list = ( + list(range(df_pd.shape[0])) + if indices is None + else convert_indices_to_list(indices, size=df_pd.shape[0]) + ) + + pd_df_series = pd_df.loc[indices_list, col].map(mapping) + df_pd_series = df_pd.column_map( + col, mapping, indices=indices, output="series" + ) + df_pl_numpy = df_pl.column_map( + col, mapping, indices=indices, output="numpy" + ) + df_pl_list = df_pl.column_map(col, mapping, indices=indices, output="list") + assert df_pd_series.equals(pd_df_series) + assert np.equal(df_pl_numpy, pd_df_series.values).all() + assert df_pl_list == pd_df_series.tolist() + + df_pd.column_map(col, mapping, indices=None, as_column="result") + df_pl.column_map(col, mapping, indices=None, as_column="result") + assert not df_pd().equals(pd_df) + assert not df_pl().frame_equal(pl_df) + assert df_pd().equals(df_pl.to_pandas()) + + @pytest.mark.parametrize("df_data", DATAFRAME_VALUE_TEST_CASES) + @pytest.mark.parametrize("indices", ROW_INDICES_TEST_CASES) + def test_column_isin(self, df_data, indices): + for col in HASHABLE_COLUMNS: + if col not in df_data.keys(): + continue + df_pd, df_pl, pd_df, pl_df = self._get_dataframes(df_data) + lookup = set(pd_df.loc[::2, col].values) + indices_list = ( + list(range(df_pd.shape[0])) + if indices is None + else convert_indices_to_list(indices, size=df_pd.shape[0]) + ) + + pd_df_series = pd_df.loc[indices_list, col].isin(lookup) + df_pd_series = df_pd.column_isin( + col, lookup, indices=indices, output="series" + ) + df_pl_numpy = df_pl.column_isin( + col, lookup, indices=indices, output="numpy" + ) + df_pl_list = df_pl.column_isin(col, lookup, indices=indices, output="list") + assert df_pd_series.equals(pd_df_series) + assert np.equal(df_pl_numpy, pd_df_series.values).all() + assert df_pl_list == pd_df_series.tolist() + + df_pd.column_isin(col, lookup, indices=None, as_column="result") + df_pl.column_isin(col, lookup, indices=None, as_column="result") + assert not df_pd().equals(pd_df) + assert not df_pl().frame_equal(pl_df) + assert df_pd().equals(df_pl.to_pandas()) + + @pytest.mark.parametrize("df_data", DATAFRAME_VALUE_TEST_CASES) + 
@pytest.mark.parametrize("indices", ROW_INDICES_TEST_CASES) + def test_column_apply(self, df_data, indices): + df_tmp, _, _, _ = self._get_dataframes(df_data) + + def func(x): + if isinstance(x, (str, int, float)): + return x * 2 + elif hasattr(x, "__iter__"): + return sum(x) + else: + return str(x) + + for col in df_tmp.columns: + df_pd, df_pl, pd_df, pl_df = self._get_dataframes(df_data) + indices_list = ( + list(range(df_pd.shape[0])) + if indices is None + else convert_indices_to_list(indices, size=df_pd.shape[0]) + ) + + pd_df_series = pd_df.loc[indices_list, col].apply(func) + df_pd_series = df_pd.column_apply( + col, func, indices=indices, output="series" + ) + df_pl_numpy = df_pl.column_apply(col, func, indices=indices, output="numpy") + df_pl_list = df_pl.column_apply(col, func, indices=indices, output="list") + assert df_pd_series.equals(pd_df_series) + assert np.equal(df_pl_numpy, pd_df_series.values).all() + assert df_pl_list == pd_df_series.tolist() + + df_pd.column_apply(col, func, indices=None, as_column="result") + df_pl.column_apply(col, func, indices=None, as_column="result") + assert not df_pd().equals(pd_df) + assert not df_pl().frame_equal(pl_df) + assert df_pd().equals(df_pl.to_pandas()) + + @pytest.mark.parametrize("df_data", DATAFRAME_VALUE_TEST_CASES) + @pytest.mark.parametrize("indices", ROW_INDICES_TEST_CASES) + def test_row_apply(self, df_data, indices): + df_pd, df_pl, pd_df, pl_df = self._get_dataframes(df_data) + indices_list = ( + list(range(df_pd.shape[0])) + if indices is None + else convert_indices_to_list(indices, size=df_pd.shape[0]) + ) + + def func(row): + return str(row["int"]) + + pd_df_series = pd_df.loc[indices_list].apply(func, axis=1) + df_pd_series = df_pd.row_apply(func, indices=indices, output="series") + df_pl_numpy = df_pl.row_apply(func, indices=indices, output="numpy") + df_pl_list = df_pl.row_apply(func, indices=indices, output="list") + assert df_pd_series.equals(pd_df_series) + assert np.equal(df_pl_numpy, pd_df_series.values).all() + assert df_pl_list == pd_df_series.tolist() + + df_pd.row_apply(func, indices=None, as_column="result") + df_pl.row_apply(func, indices=None, as_column="result") + assert not df_pd().equals(pd_df) + assert not df_pl().frame_equal(pl_df) + assert df_pd().equals(df_pl.to_pandas()) + + @pytest.mark.parametrize("df_data", DATAFRAME_VALUE_TEST_CASES) + def test_get_cell_by_row_column(self, df_data): + df_pd, df_pl, pd_df, pl_df = self._get_dataframes(df_data) + for col in df_pd.columns: + row = np.random.randint(0, df_pd.shape[0]) + + df_pd_val = df_pd.get_cell_by_row_column(row, col) + df_pl_val = df_pl.get_cell_by_row_column(row, col) + pd_df_val = pd_df.at[row, col] + pl_df_val = pl_df.row(row, named=True)[col] + + if isinstance(df_pd_val, np.ndarray): + assert np.equal(df_pd_val, df_pl_val).all() + assert np.equal(df_pd_val, pd_df_val).all() + assert np.equal(df_pd_val, pl_df_val).all() + else: + assert df_pd_val == df_pl_val == pd_df_val == pl_df_val + + @pytest.mark.parametrize("df_data", DATAFRAME_VALUE_TEST_CASES) + def test_set_cell_by_row_column(self, df_data): + df_pd, df_pl, pd_df, pl_df = self._get_dataframes(df_data) + for col in df_pd.columns: + row = np.random.randint(0, df_pd.shape[0] // 2) + old_value = df_pd.get_cell_by_row_column(row, col) + value = df_pd.get_cell_by_row_column(df_pd.shape[0] - 1 - row, col) + value = numpy_to_native(value) + + # as of Apr 2023: pyarrow does not support assigning a list to a polars cell + tolerated = pl.exceptions.ArrowError if isinstance(value, list) else 
None + + try: + df_pd.set_cell_by_row_column(row, col, value) + df_pl.set_cell_by_row_column(row, col, value) + assert pd_df.at[row, col] == pl_df[row, col] == old_value + + df_pd_val = df_pd.get_cell_by_row_column(row, col) + df_pl_val = df_pl.get_cell_by_row_column(row, col) + assert df_pd_val == df_pl_val == value + + pd_df.at[row, col] = value + pl_df[row, col] = value + self._assert_equivalent_dataframes(df_pd(), df_pl(), pd_df, pl_df) + except Exception as e: + if tolerated is None or not isinstance(e, tolerated): + raise e diff --git a/tests/utils/test_snorkel_helper.py b/tests/utils/test_snorkel_helper.py index 52247443..02280a89 100644 --- a/tests/utils/test_snorkel_helper.py +++ b/tests/utils/test_snorkel_helper.py @@ -8,7 +8,7 @@ def original(row): return "long" if len(row["text"]) > 5 else "short" targets = ["long", "short"] - one_row = example_raw_df.iloc[0] + one_row = example_raw_df.get_row_as_dict(0) # create LF with pre-determined label encodings label_encoder = {t: i for i, t in enumerate(targets)} diff --git a/tests/utils/test_typecheck.py b/tests/utils/test_typecheck.py new file mode 100644 index 00000000..8ce1e2de --- /dev/null +++ b/tests/utils/test_typecheck.py @@ -0,0 +1,28 @@ +from hover.utils.typecheck import TypedValueDict +from collections import defaultdict + + +class TestTypedValueDict: + def test_basic(self): + tdict = TypedValueDict(int) + tdict["key1"] = 1 + assert tdict["key1"] == 1 + + tdict.update({"key2": 2, "key3": 3}) + assert tdict["key2"] == 2 + assert tdict["key3"] == 3 + + try: + tdict["key4"] = "4" + raise AssertionError("Should have raised TypeError") + except TypeError: + pass + + def test_subclass(self): + tdict = TypedValueDict(dict) + tdict["key1"] = {"foo": "bar"} + assert tdict["key1"] == {"foo": "bar"} + + ddict = defaultdict(str) + tdict.update({"key2": ddict}) + assert tdict["key2"] is ddict