From 1995f538e973be2beb340f375e8f753b0e3284a6 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Wed, 27 Nov 2024 15:16:01 +0100 Subject: [PATCH 01/12] wip --- src/mostlyai/qa/common.py | 29 +++++++++++++++++++---------- src/mostlyai/qa/report.py | 33 +++++++++++++++++++-------------- 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/mostlyai/qa/common.py b/src/mostlyai/qa/common.py index f6ad52e..a3e3699 100644 --- a/src/mostlyai/qa/common.py +++ b/src/mostlyai/qa/common.py @@ -13,9 +13,11 @@ # limitations under the License. import logging -from typing import Protocol +from functools import partial +from typing import Protocol, Callable import pandas as pd +from rich.progress import Progress from tqdm.auto import tqdm from mostlyai.qa.filesystem import Statistics @@ -73,18 +75,25 @@ class PrerequisiteNotMetError(Exception): class ProgressCallback(Protocol): - def __call__(self, current: int, total: int) -> None: ... - + def __call__( + self, total: float | None = None, completed: float | None = None, advance: float | None = None, **kwargs + ) -> None: ... + +def wrap_progress_callback(update_progress: ProgressCallback | None = None, **kwargs) -> tuple[ProgressCallback, Callable]: + if not update_progress: + rich_progress = Progress() + rich_progress.start() + task_id = rich_progress.add_task(**kwargs) + update_progress = partial(rich_progress.update, task_id=task_id) + else: + rich_progress = None -def add_tqdm(on_progress: ProgressCallback | None = None, description: str = "Processing") -> ProgressCallback: - pbar = tqdm(desc=description, total=100) + def teardown_wrapped_progress_callback(*args, **kwargs): + if rich_progress: + rich_progress.stop() - def _on_progress(current: int, total: int): - if on_progress is not None: - on_progress(current, total) - pbar.update(current - pbar.n) + return update_progress, teardown_wrapped_progress_callback - return _on_progress def check_min_sample_size(size: int, min: int, type: str) -> None: diff --git a/src/mostlyai/qa/report.py b/src/mostlyai/qa/report.py index b0c8d9d..45ab5d6 100644 --- a/src/mostlyai/qa/report.py +++ b/src/mostlyai/qa/report.py @@ -13,11 +13,15 @@ # limitations under the License. import logging +from functools import partial +from idlelib.debugger_r import wrap_info from pathlib import Path import numpy as np import pandas as pd +from docutils.nodes import description from pandas.core.dtypes.common import is_numeric_dtype, is_datetime64_dtype +from rich.progress import Progress from mostlyai.qa import distances, similarity, html_report from mostlyai.qa.accuracy import ( @@ -46,7 +50,7 @@ NXT_COLUMN, CTX_COLUMN_PREFIX, TGT_COLUMN_PREFIX, - REPORT_CREDITS, + REPORT_CREDITS, wrap_progress_callback, ) from mostlyai.qa.filesystem import Statistics, TemporaryWorkspace @@ -71,7 +75,7 @@ def report( max_sample_size_accuracy: int | None = None, max_sample_size_embeddings: int | None = None, statistics_path: str | Path | None = None, - on_progress: ProgressCallback | None = None, + update_progress: ProgressCallback | None = None, ) -> tuple[Path, Metrics | None]: """ Generate HTML report and metrics for comparing synthetic and original data samples. @@ -93,7 +97,7 @@ def report( max_sample_size_accuracy: Max sample size for accuracy max_sample_size_embeddings: Max sample size for embeddings (similarity & distances) statistics_path: Path of where to store the statistics to be used by `report_from_statistics` - on_progress: A custom progress callback + update_progress: A custom progress callback Returns: 1. Path to the HTML report 2. Pydantic Metrics: @@ -120,8 +124,8 @@ def report( """ with TemporaryWorkspace() as workspace: - on_progress = add_tqdm(on_progress, description="Creating report") - on_progress(current=0, total=100) + update_progress, teardown_progress = wrap_progress_callback(update_progress, description="Creating report") + update_progress(completed=0, total=100) # ensure all columns are present and in the same order as training data syn_tgt_data = syn_tgt_data[trn_tgt_data.columns] @@ -165,7 +169,7 @@ def report( _LOG.info(err) statistics.mark_early_exit() html_report.store_early_exit_report(report_path) - on_progress(current=100, total=100) + update_progress(completed=100, total=100) return report_path, None # prepare datasets for accuracy @@ -194,7 +198,7 @@ def report( max_sample_size=max_sample_size_accuracy, setup=setup, ) - on_progress(current=5, total=100) + update_progress(completed=5, total=100) _LOG.info("prepare training data for accuracy started") trn = pull_data_for_accuracy( @@ -205,7 +209,7 @@ def report( max_sample_size=max_sample_size_accuracy, setup=setup, ) - on_progress(current=10, total=100) + update_progress(completed=10, total=100) # coerce dtypes to match the original training data dtypes for col in trn: @@ -222,7 +226,7 @@ def report( statistics=statistics, workspace=workspace, ) - on_progress(current=20, total=100) + update_progress(completed=20, total=100) # ensure that embeddings are all equal size for a fair 3-way comparison max_sample_size_embeddings = min( @@ -245,7 +249,7 @@ def _calc_pull_embeds(df_tgt: pd.DataFrame, df_ctx: pd.DataFrame, start: int, st embeds = [] for i, bucket in enumerate(buckets, 1): embeds += [calculate_embeddings(bucket.tolist())] - on_progress(current=start + i, total=100) + update_progress(completed=start + i, total=100) embeds = np.concatenate(embeds, axis=0) _LOG.info(f"calculated embeddings {embeds.shape}") return embeds @@ -256,7 +260,7 @@ def _calc_pull_embeds(df_tgt: pd.DataFrame, df_ctx: pd.DataFrame, start: int, st hol_embeds = _calc_pull_embeds(df_tgt=hol_tgt_data, df_ctx=hol_ctx_data, start=60, stop=80) else: hol_embeds = None - on_progress(current=80, total=100) + update_progress(completed=80, total=100) _LOG.info("report similarity") sim_cosine_trn_hol, sim_cosine_trn_syn, sim_auc_trn_hol, sim_auc_trn_syn = report_similarity( @@ -266,7 +270,7 @@ def _calc_pull_embeds(df_tgt: pd.DataFrame, df_ctx: pd.DataFrame, start: int, st workspace=workspace, statistics=statistics, ) - on_progress(current=90, total=100) + update_progress(completed=90, total=100) _LOG.info("report distances") dcr_trn, dcr_hol = report_distances( @@ -275,7 +279,7 @@ def _calc_pull_embeds(df_tgt: pd.DataFrame, df_ctx: pd.DataFrame, start: int, st hol_embeds=hol_embeds, workspace=workspace, ) - on_progress(current=99, total=100) + update_progress(completed=99, total=100) metrics = calculate_metrics( acc_uni=acc_uni, @@ -312,7 +316,8 @@ def _calc_pull_embeds(df_tgt: pd.DataFrame, df_ctx: pd.DataFrame, start: int, st acc_biv=acc_biv, corr_trn=corr_trn, ) - on_progress(current=100, total=100) + update_progress(completed=100, total=100) + teardown_progress() return report_path, metrics From 90d3e5b01f7bf7002beec9b6292a13b7ec020cbf Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Wed, 27 Nov 2024 15:17:47 +0100 Subject: [PATCH 02/12] wip --- src/mostlyai/qa/report.py | 1 - src/mostlyai/qa/report_from_statistics.py | 19 +++++++++---------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/mostlyai/qa/report.py b/src/mostlyai/qa/report.py index 45ab5d6..e4e3cea 100644 --- a/src/mostlyai/qa/report.py +++ b/src/mostlyai/qa/report.py @@ -46,7 +46,6 @@ ProgressCallback, PrerequisiteNotMetError, check_min_sample_size, - add_tqdm, NXT_COLUMN, CTX_COLUMN_PREFIX, TGT_COLUMN_PREFIX, diff --git a/src/mostlyai/qa/report_from_statistics.py b/src/mostlyai/qa/report_from_statistics.py index e8815ae..9216057 100644 --- a/src/mostlyai/qa/report_from_statistics.py +++ b/src/mostlyai/qa/report_from_statistics.py @@ -26,10 +26,9 @@ ProgressCallback, PrerequisiteNotMetError, check_min_sample_size, - add_tqdm, check_statistics_prerequisite, determine_data_size, - REPORT_CREDITS, + REPORT_CREDITS, wrap_progress_callback, ) from mostlyai.qa.filesystem import Statistics, TemporaryWorkspace @@ -50,11 +49,11 @@ def report_from_statistics( report_extra_info: str = "", max_sample_size_accuracy: int | None = None, max_sample_size_embeddings: int | None = None, - on_progress: ProgressCallback | None = None, + update_progress: ProgressCallback | None = None, ) -> Path: with TemporaryWorkspace() as workspace: - on_progress = add_tqdm(on_progress, description="Creating report from statistics") - on_progress(current=0, total=100) + update_progress, teardown_progress = wrap_progress_callback(update_progress, description="Creating report from statistics") + update_progress(completed=0, total=100) # prepare report_path if report_path is None: @@ -73,7 +72,7 @@ def report_from_statistics( check_min_sample_size(syn_sample_size, 100, "synthetic") except PrerequisiteNotMetError: html_report.store_early_exit_report(report_path) - on_progress(current=100, total=100) + update_progress(completed=100, total=100) return report_path meta = statistics.load_meta() @@ -96,7 +95,7 @@ def report_from_statistics( max_sample_size=max_sample_size_accuracy, ) _LOG.info(f"sample synthetic data finished ({syn.shape=})") - on_progress(current=20, total=100) + update_progress(completed=20, total=100) # calculate and plot accuracy and correlations acc_uni, acc_biv, corr_trn = report_accuracy_and_correlations_from_statistics( @@ -104,7 +103,7 @@ def report_from_statistics( statistics=statistics, workspace=workspace, ) - on_progress(current=30, total=100) + update_progress(completed=30, total=100) _LOG.info("calculate embeddings for synthetic") syn_embeds = calculate_embeddings( @@ -123,7 +122,7 @@ def report_from_statistics( workspace=workspace, statistics=statistics, ) - on_progress(current=50, total=100) + update_progress(completed=50, total=100) meta |= { "rows_synthetic": syn.shape[0], @@ -144,7 +143,7 @@ def report_from_statistics( acc_biv=acc_biv, corr_trn=corr_trn, ) - on_progress(current=100, total=100) + update_progress(completed=100, total=100) return report_path From 42d9f0d3b97d0ae7c7dc600eca83646bc2efae7a Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Wed, 27 Nov 2024 15:20:40 +0100 Subject: [PATCH 03/12] wip --- src/mostlyai/qa/report.py | 8 +++----- src/mostlyai/qa/report_from_statistics.py | 9 +++++++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/mostlyai/qa/report.py b/src/mostlyai/qa/report.py index e4e3cea..fedf919 100644 --- a/src/mostlyai/qa/report.py +++ b/src/mostlyai/qa/report.py @@ -13,15 +13,11 @@ # limitations under the License. import logging -from functools import partial -from idlelib.debugger_r import wrap_info from pathlib import Path import numpy as np import pandas as pd -from docutils.nodes import description from pandas.core.dtypes.common import is_numeric_dtype, is_datetime64_dtype -from rich.progress import Progress from mostlyai.qa import distances, similarity, html_report from mostlyai.qa.accuracy import ( @@ -49,7 +45,8 @@ NXT_COLUMN, CTX_COLUMN_PREFIX, TGT_COLUMN_PREFIX, - REPORT_CREDITS, wrap_progress_callback, + REPORT_CREDITS, + wrap_progress_callback, ) from mostlyai.qa.filesystem import Statistics, TemporaryWorkspace @@ -169,6 +166,7 @@ def report( statistics.mark_early_exit() html_report.store_early_exit_report(report_path) update_progress(completed=100, total=100) + teardown_progress() return report_path, None # prepare datasets for accuracy diff --git a/src/mostlyai/qa/report_from_statistics.py b/src/mostlyai/qa/report_from_statistics.py index 9216057..eeaf7ff 100644 --- a/src/mostlyai/qa/report_from_statistics.py +++ b/src/mostlyai/qa/report_from_statistics.py @@ -28,7 +28,8 @@ check_min_sample_size, check_statistics_prerequisite, determine_data_size, - REPORT_CREDITS, wrap_progress_callback, + REPORT_CREDITS, + wrap_progress_callback, ) from mostlyai.qa.filesystem import Statistics, TemporaryWorkspace @@ -52,7 +53,9 @@ def report_from_statistics( update_progress: ProgressCallback | None = None, ) -> Path: with TemporaryWorkspace() as workspace: - update_progress, teardown_progress = wrap_progress_callback(update_progress, description="Creating report from statistics") + update_progress, teardown_progress = wrap_progress_callback( + update_progress, description="Creating report from statistics" + ) update_progress(completed=0, total=100) # prepare report_path @@ -73,6 +76,7 @@ def report_from_statistics( except PrerequisiteNotMetError: html_report.store_early_exit_report(report_path) update_progress(completed=100, total=100) + teardown_progress() return report_path meta = statistics.load_meta() @@ -144,6 +148,7 @@ def report_from_statistics( corr_trn=corr_trn, ) update_progress(completed=100, total=100) + teardown_progress() return report_path From 49fb3fb04d52dc2c705610fc77d7b508dad3ef14 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Wed, 27 Nov 2024 15:32:27 +0100 Subject: [PATCH 04/12] wip --- src/mostlyai/qa/common.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/mostlyai/qa/common.py b/src/mostlyai/qa/common.py index a3e3699..137951f 100644 --- a/src/mostlyai/qa/common.py +++ b/src/mostlyai/qa/common.py @@ -18,7 +18,6 @@ import pandas as pd from rich.progress import Progress -from tqdm.auto import tqdm from mostlyai.qa.filesystem import Statistics @@ -79,7 +78,10 @@ def __call__( self, total: float | None = None, completed: float | None = None, advance: float | None = None, **kwargs ) -> None: ... -def wrap_progress_callback(update_progress: ProgressCallback | None = None, **kwargs) -> tuple[ProgressCallback, Callable]: + +def wrap_progress_callback( + update_progress: ProgressCallback | None = None, **kwargs +) -> tuple[ProgressCallback, Callable]: if not update_progress: rich_progress = Progress() rich_progress.start() @@ -88,14 +90,13 @@ def wrap_progress_callback(update_progress: ProgressCallback | None = None, **kw else: rich_progress = None - def teardown_wrapped_progress_callback(*args, **kwargs): + def teardown_wrapped_progress_callback(): if rich_progress: rich_progress.stop() return update_progress, teardown_wrapped_progress_callback - def check_min_sample_size(size: int, min: int, type: str) -> None: if size < min: raise PrerequisiteNotMetError(f"At least {min} rows are required, but only {size} were found for {type}.") From 66bbaf8b090c02bffef061915f8d8516bf5f534b Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Wed, 27 Nov 2024 15:34:17 +0100 Subject: [PATCH 05/12] wip --- src/mostlyai/qa/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mostlyai/qa/common.py b/src/mostlyai/qa/common.py index 137951f..d1e309c 100644 --- a/src/mostlyai/qa/common.py +++ b/src/mostlyai/qa/common.py @@ -90,11 +90,11 @@ def wrap_progress_callback( else: rich_progress = None - def teardown_wrapped_progress_callback(): + def teardown_progress(): if rich_progress: rich_progress.stop() - return update_progress, teardown_wrapped_progress_callback + return update_progress, teardown_progress def check_min_sample_size(size: int, min: int, type: str) -> None: From 179f74ca217377e6d3277f2be0ac2df77b52a300 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Wed, 27 Nov 2024 16:01:50 +0100 Subject: [PATCH 06/12] wip --- poetry.lock | 4 ++-- pyproject.toml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index feafa92..b5524ab 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "annotated-types" @@ -3035,4 +3035,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "84b6afdb4c4101d2a0fbe8aeddabb53e03e9aa5886dd1e8bf95181c24c17c349" +content-hash = "12fab0b7c571095916f88504bb577687deb6c8d23b82ac4530ca2e82c1170980" diff --git a/pyproject.toml b/pyproject.toml index bcaa2b1..85f9d4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ joblib = ">=1.2.0" Jinja2 = ">=3.1.2" scikit-learn = ">=1.4.0" sentence-transformers = ">=3.1.0" +rich = "^13.9.4" [tool.poetry.group.dev.dependencies] ruff = "0.7.0" From b6816300e32c77cd6ca6d1facf506c8205a2172e Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Thu, 28 Nov 2024 15:06:04 +0100 Subject: [PATCH 07/12] simplify --- src/mostlyai/qa/common.py | 47 ++++++++++++++++------- src/mostlyai/qa/report.py | 30 +++++++-------- src/mostlyai/qa/report_from_statistics.py | 22 ++++------- 3 files changed, 54 insertions(+), 45 deletions(-) diff --git a/src/mostlyai/qa/common.py b/src/mostlyai/qa/common.py index d1e309c..4d9e2dc 100644 --- a/src/mostlyai/qa/common.py +++ b/src/mostlyai/qa/common.py @@ -79,22 +79,41 @@ def __call__( ) -> None: ... -def wrap_progress_callback( - update_progress: ProgressCallback | None = None, **kwargs -) -> tuple[ProgressCallback, Callable]: - if not update_progress: - rich_progress = Progress() - rich_progress.start() - task_id = rich_progress.add_task(**kwargs) - update_progress = partial(rich_progress.update, task_id=task_id) - else: - rich_progress = None +class ProgressCallbackWrapper: + @staticmethod + def _wrap_progress_callback( + update_progress: ProgressCallback | None = None, **kwargs + ) -> tuple[ProgressCallback, Callable]: + if not update_progress: + rich_progress = Progress() + rich_progress.start() + task_id = rich_progress.add_task(**kwargs) + update_progress = partial(rich_progress.update, task_id=task_id) + else: + rich_progress = None + + def teardown_progress(): + if rich_progress: + rich_progress.stop() + + return update_progress, teardown_progress + + def update( + self, total: float | None = None, completed: float | None = None, advance: float | None = None, **kwargs + ) -> None: + self.update_progress(total=total, completed=completed, advance=advance, **kwargs) + + def __init__(self, update_progress: ProgressCallback | None = None, **kwargs): + self.update_progress, self.teardown_progress = self._wrap_progress_callback(update_progress, **kwargs) - def teardown_progress(): - if rich_progress: - rich_progress.stop() + def __enter__(self): + self.update_progress(completed=0, total=1) + return self - return update_progress, teardown_progress + def __exit__(self, exc_type, exc_value, traceback): + if exc_type is not None: + self.update_progress(completed=1, total=1) + self.teardown_progress() def check_min_sample_size(size: int, min: int, type: str) -> None: diff --git a/src/mostlyai/qa/report.py b/src/mostlyai/qa/report.py index 46cc52e..9514181 100644 --- a/src/mostlyai/qa/report.py +++ b/src/mostlyai/qa/report.py @@ -46,7 +46,7 @@ CTX_COLUMN_PREFIX, TGT_COLUMN_PREFIX, REPORT_CREDITS, - wrap_progress_callback, + ProgressCallbackWrapper, ) from mostlyai.qa.filesystem import Statistics, TemporaryWorkspace @@ -119,10 +119,10 @@ def report( - `dcr_share`: Share of synthetic samples that are closer to a training sample than to a holdout sample. This shall not be significantly larger than 50\%. """ - with TemporaryWorkspace() as workspace: - update_progress, teardown_progress = wrap_progress_callback(update_progress, description="Creating report") - update_progress(completed=0, total=100) - + with ( + TemporaryWorkspace() as workspace, + ProgressCallbackWrapper(update_progress, description="Creating report") as progress, + ): # ensure all columns are present and in the same order as training data syn_tgt_data = syn_tgt_data[trn_tgt_data.columns] if hol_tgt_data is not None: @@ -165,8 +165,6 @@ def report( _LOG.info(err) statistics.mark_early_exit() html_report.store_early_exit_report(report_path) - update_progress(completed=100, total=100) - teardown_progress() return report_path, None # prepare datasets for accuracy @@ -195,7 +193,7 @@ def report( max_sample_size=max_sample_size_accuracy, setup=setup, ) - update_progress(completed=5, total=100) + progress.update(completed=5, total=100) _LOG.info("prepare training data for accuracy started") trn = pull_data_for_accuracy( @@ -206,7 +204,7 @@ def report( max_sample_size=max_sample_size_accuracy, setup=setup, ) - update_progress(completed=10, total=100) + progress.update(completed=10, total=100) # coerce dtypes to match the original training data dtypes for col in trn: @@ -223,7 +221,7 @@ def report( statistics=statistics, workspace=workspace, ) - update_progress(completed=20, total=100) + progress.update(completed=20, total=100) # ensure that embeddings are all equal size for a fair 3-way comparison max_sample_size_embeddings = min( @@ -247,8 +245,8 @@ def _calc_pull_embeds(df_tgt: pd.DataFrame, df_ctx: pd.DataFrame, start: int, st embeds = [] for i, bucket in enumerate(buckets, 1): embeds += [calculate_embeddings(bucket.tolist())] - update_progress(completed=start + i, total=100) - update_progress(completed=stop, total=100) + progress.update(completed=start + i, total=100) + progress.update(completed=stop, total=100) embeds = np.concatenate(embeds, axis=0) _LOG.info(f"calculated embeddings {embeds.shape}") return embeds @@ -259,7 +257,7 @@ def _calc_pull_embeds(df_tgt: pd.DataFrame, df_ctx: pd.DataFrame, start: int, st hol_embeds = _calc_pull_embeds(df_tgt=hol_tgt_data, df_ctx=hol_ctx_data, start=60, stop=80) else: hol_embeds = None - update_progress(completed=80, total=100) + progress.update(completed=80, total=100) _LOG.info("report similarity") sim_cosine_trn_hol, sim_cosine_trn_syn, sim_auc_trn_hol, sim_auc_trn_syn = report_similarity( @@ -269,7 +267,7 @@ def _calc_pull_embeds(df_tgt: pd.DataFrame, df_ctx: pd.DataFrame, start: int, st workspace=workspace, statistics=statistics, ) - update_progress(completed=90, total=100) + progress.update(completed=90, total=100) _LOG.info("report distances") dcr_trn, dcr_hol = report_distances( @@ -278,7 +276,7 @@ def _calc_pull_embeds(df_tgt: pd.DataFrame, df_ctx: pd.DataFrame, start: int, st hol_embeds=hol_embeds, workspace=workspace, ) - update_progress(completed=99, total=100) + progress.update(completed=99, total=100) metrics = calculate_metrics( acc_uni=acc_uni, @@ -315,8 +313,6 @@ def _calc_pull_embeds(df_tgt: pd.DataFrame, df_ctx: pd.DataFrame, start: int, st acc_biv=acc_biv, corr_trn=corr_trn, ) - update_progress(completed=100, total=100) - teardown_progress() return report_path, metrics diff --git a/src/mostlyai/qa/report_from_statistics.py b/src/mostlyai/qa/report_from_statistics.py index eeaf7ff..a7d483d 100644 --- a/src/mostlyai/qa/report_from_statistics.py +++ b/src/mostlyai/qa/report_from_statistics.py @@ -29,7 +29,7 @@ check_statistics_prerequisite, determine_data_size, REPORT_CREDITS, - wrap_progress_callback, + ProgressCallbackWrapper, ) from mostlyai.qa.filesystem import Statistics, TemporaryWorkspace @@ -52,12 +52,10 @@ def report_from_statistics( max_sample_size_embeddings: int | None = None, update_progress: ProgressCallback | None = None, ) -> Path: - with TemporaryWorkspace() as workspace: - update_progress, teardown_progress = wrap_progress_callback( - update_progress, description="Creating report from statistics" - ) - update_progress(completed=0, total=100) - + with ( + TemporaryWorkspace() as workspace, + ProgressCallbackWrapper(update_progress, description="Creating report from statistics") as progress, + ): # prepare report_path if report_path is None: report_path = Path.cwd() / "data-report.html" @@ -75,8 +73,6 @@ def report_from_statistics( check_min_sample_size(syn_sample_size, 100, "synthetic") except PrerequisiteNotMetError: html_report.store_early_exit_report(report_path) - update_progress(completed=100, total=100) - teardown_progress() return report_path meta = statistics.load_meta() @@ -99,7 +95,7 @@ def report_from_statistics( max_sample_size=max_sample_size_accuracy, ) _LOG.info(f"sample synthetic data finished ({syn.shape=})") - update_progress(completed=20, total=100) + progress.update(completed=20, total=100) # calculate and plot accuracy and correlations acc_uni, acc_biv, corr_trn = report_accuracy_and_correlations_from_statistics( @@ -107,7 +103,7 @@ def report_from_statistics( statistics=statistics, workspace=workspace, ) - update_progress(completed=30, total=100) + progress.update(completed=30, total=100) _LOG.info("calculate embeddings for synthetic") syn_embeds = calculate_embeddings( @@ -126,7 +122,7 @@ def report_from_statistics( workspace=workspace, statistics=statistics, ) - update_progress(completed=50, total=100) + progress.update(completed=50, total=100) meta |= { "rows_synthetic": syn.shape[0], @@ -147,8 +143,6 @@ def report_from_statistics( acc_biv=acc_biv, corr_trn=corr_trn, ) - update_progress(completed=100, total=100) - teardown_progress() return report_path From b72496b5a1a638a5b76adccc139ef14f73832c57 Mon Sep 17 00:00:00 2001 From: Michael Platzer Date: Thu, 28 Nov 2024 16:26:44 +0100 Subject: [PATCH 08/12] refined wording + ensure 100% bars --- src/mostlyai/qa/common.py | 1 + src/mostlyai/qa/report.py | 19 +++++++++++-------- src/mostlyai/qa/report_from_statistics.py | 3 ++- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/mostlyai/qa/common.py b/src/mostlyai/qa/common.py index 4d9e2dc..1f7b878 100644 --- a/src/mostlyai/qa/common.py +++ b/src/mostlyai/qa/common.py @@ -94,6 +94,7 @@ def _wrap_progress_callback( def teardown_progress(): if rich_progress: + rich_progress.refresh() rich_progress.stop() return update_progress, teardown_progress diff --git a/src/mostlyai/qa/report.py b/src/mostlyai/qa/report.py index 9514181..4405cad 100644 --- a/src/mostlyai/qa/report.py +++ b/src/mostlyai/qa/report.py @@ -121,7 +121,7 @@ def report( with ( TemporaryWorkspace() as workspace, - ProgressCallbackWrapper(update_progress, description="Creating report") as progress, + ProgressCallbackWrapper(update_progress, description="Create report") as progress, ): # ensure all columns are present and in the same order as training data syn_tgt_data = syn_tgt_data[trn_tgt_data.columns] @@ -231,7 +231,9 @@ def report( hol_sample_size or float("inf"), ) - def _calc_pull_embeds(df_tgt: pd.DataFrame, df_ctx: pd.DataFrame, start: int, stop: int) -> np.ndarray: + def _calc_pull_embeds( + df_tgt: pd.DataFrame, df_ctx: pd.DataFrame, progress_from: int, progress_to: int + ) -> np.ndarray: strings = pull_data_for_embeddings( df_tgt=df_tgt, df_ctx=df_ctx, @@ -240,21 +242,21 @@ def _calc_pull_embeds(df_tgt: pd.DataFrame, df_ctx: pd.DataFrame, start: int, st max_sample_size=max_sample_size_embeddings, ) # split into buckets for calculating embeddings to avoid memory issues and report continuous progress - buckets = np.array_split(strings, stop - start) + buckets = np.array_split(strings, progress_to - progress_from) buckets = [b for b in buckets if len(b) > 0] embeds = [] for i, bucket in enumerate(buckets, 1): embeds += [calculate_embeddings(bucket.tolist())] - progress.update(completed=start + i, total=100) - progress.update(completed=stop, total=100) + progress.update(completed=progress_from + i, total=100) + progress.update(completed=progress_to, total=100) embeds = np.concatenate(embeds, axis=0) _LOG.info(f"calculated embeddings {embeds.shape}") return embeds - syn_embeds = _calc_pull_embeds(df_tgt=syn_tgt_data, df_ctx=syn_ctx_data, start=20, stop=40) - trn_embeds = _calc_pull_embeds(df_tgt=trn_tgt_data, df_ctx=trn_ctx_data, start=40, stop=60) + syn_embeds = _calc_pull_embeds(df_tgt=syn_tgt_data, df_ctx=syn_ctx_data, progress_from=20, progress_to=40) + trn_embeds = _calc_pull_embeds(df_tgt=trn_tgt_data, df_ctx=trn_ctx_data, progress_from=40, progress_to=60) if hol_tgt_data is not None: - hol_embeds = _calc_pull_embeds(df_tgt=hol_tgt_data, df_ctx=hol_ctx_data, start=60, stop=80) + hol_embeds = _calc_pull_embeds(df_tgt=hol_tgt_data, df_ctx=hol_ctx_data, progress_from=60, progress_to=80) else: hol_embeds = None progress.update(completed=80, total=100) @@ -313,6 +315,7 @@ def _calc_pull_embeds(df_tgt: pd.DataFrame, df_ctx: pd.DataFrame, start: int, st acc_biv=acc_biv, corr_trn=corr_trn, ) + progress.update(completed=100, total=100) return report_path, metrics diff --git a/src/mostlyai/qa/report_from_statistics.py b/src/mostlyai/qa/report_from_statistics.py index a7d483d..8c7a404 100644 --- a/src/mostlyai/qa/report_from_statistics.py +++ b/src/mostlyai/qa/report_from_statistics.py @@ -54,7 +54,7 @@ def report_from_statistics( ) -> Path: with ( TemporaryWorkspace() as workspace, - ProgressCallbackWrapper(update_progress, description="Creating report from statistics") as progress, + ProgressCallbackWrapper(update_progress, description="Create report") as progress, ): # prepare report_path if report_path is None: @@ -143,6 +143,7 @@ def report_from_statistics( acc_biv=acc_biv, corr_trn=corr_trn, ) + progress.update(completed=100, total=100) return report_path From 287e1f83f14a18db37e9bb0261bef67219d8dbbc Mon Sep 17 00:00:00 2001 From: Michael Platzer Date: Thu, 28 Nov 2024 16:43:12 +0100 Subject: [PATCH 09/12] =?UTF-8?q?=20=F0=9F=9A=80=20[skip=20ci]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mostlyai/qa/report.py | 2 +- src/mostlyai/qa/report_from_statistics.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mostlyai/qa/report.py b/src/mostlyai/qa/report.py index 4405cad..b84e328 100644 --- a/src/mostlyai/qa/report.py +++ b/src/mostlyai/qa/report.py @@ -121,7 +121,7 @@ def report( with ( TemporaryWorkspace() as workspace, - ProgressCallbackWrapper(update_progress, description="Create report") as progress, + ProgressCallbackWrapper(update_progress, description="Create report 🚀") as progress, ): # ensure all columns are present and in the same order as training data syn_tgt_data = syn_tgt_data[trn_tgt_data.columns] diff --git a/src/mostlyai/qa/report_from_statistics.py b/src/mostlyai/qa/report_from_statistics.py index 8c7a404..82e609e 100644 --- a/src/mostlyai/qa/report_from_statistics.py +++ b/src/mostlyai/qa/report_from_statistics.py @@ -54,7 +54,7 @@ def report_from_statistics( ) -> Path: with ( TemporaryWorkspace() as workspace, - ProgressCallbackWrapper(update_progress, description="Create report") as progress, + ProgressCallbackWrapper(update_progress, description="Create report 🚀") as progress, ): # prepare report_path if report_path is None: From a32a31eb34a8227a1fe88989c4b862f164bdab73 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Thu, 28 Nov 2024 17:24:57 +0100 Subject: [PATCH 10/12] fixes --- src/mostlyai/qa/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mostlyai/qa/common.py b/src/mostlyai/qa/common.py index 1f7b878..be03960 100644 --- a/src/mostlyai/qa/common.py +++ b/src/mostlyai/qa/common.py @@ -112,7 +112,7 @@ def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): - if exc_type is not None: + if exc_type is None: self.update_progress(completed=1, total=1) self.teardown_progress() From ca82f49c3c3974da28c0a775f61cdc259a14deec Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Thu, 28 Nov 2024 18:21:59 +0100 Subject: [PATCH 11/12] finish --- src/mostlyai/qa/common.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/mostlyai/qa/common.py b/src/mostlyai/qa/common.py index be03960..f8c7e23 100644 --- a/src/mostlyai/qa/common.py +++ b/src/mostlyai/qa/common.py @@ -74,9 +74,7 @@ class PrerequisiteNotMetError(Exception): class ProgressCallback(Protocol): - def __call__( - self, total: float | None = None, completed: float | None = None, advance: float | None = None, **kwargs - ) -> None: ... + def __call__(self, total: float | None = None, completed: float | None = None, **kwargs) -> None: ... class ProgressCallbackWrapper: @@ -99,10 +97,8 @@ def teardown_progress(): return update_progress, teardown_progress - def update( - self, total: float | None = None, completed: float | None = None, advance: float | None = None, **kwargs - ) -> None: - self.update_progress(total=total, completed=completed, advance=advance, **kwargs) + def update(self, total: float | None = None, completed: float | None = None, **kwargs) -> None: + self.update_progress(total=total, completed=completed, **kwargs) def __init__(self, update_progress: ProgressCallback | None = None, **kwargs): self.update_progress, self.teardown_progress = self._wrap_progress_callback(update_progress, **kwargs) From 62cc3da7713e4abf7617d1d25d073f213f2afc2d Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Thu, 28 Nov 2024 18:28:06 +0100 Subject: [PATCH 12/12] finish --- src/mostlyai/qa/common.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mostlyai/qa/common.py b/src/mostlyai/qa/common.py index f8c7e23..ae2ac8c 100644 --- a/src/mostlyai/qa/common.py +++ b/src/mostlyai/qa/common.py @@ -98,19 +98,19 @@ def teardown_progress(): return update_progress, teardown_progress def update(self, total: float | None = None, completed: float | None = None, **kwargs) -> None: - self.update_progress(total=total, completed=completed, **kwargs) + self._update_progress(total=total, completed=completed, **kwargs) def __init__(self, update_progress: ProgressCallback | None = None, **kwargs): - self.update_progress, self.teardown_progress = self._wrap_progress_callback(update_progress, **kwargs) + self._update_progress, self._teardown_progress = self._wrap_progress_callback(update_progress, **kwargs) def __enter__(self): - self.update_progress(completed=0, total=1) + self._update_progress(completed=0, total=1) return self def __exit__(self, exc_type, exc_value, traceback): if exc_type is None: - self.update_progress(completed=1, total=1) - self.teardown_progress() + self._update_progress(completed=1, total=1) + self._teardown_progress() def check_min_sample_size(size: int, min: int, type: str) -> None: