From 3913730c71c84e10ba8d58c4cced2b92cb558da4 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Tue, 21 Jan 2025 16:50:11 +0100 Subject: [PATCH 1/3] wip --- mostlyai/qa/_sampling.py | 7 ++----- mostlyai/qa/reporting_from_statistics.py | 3 +++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mostlyai/qa/_sampling.py b/mostlyai/qa/_sampling.py index 4665a5a..609af87 100644 --- a/mostlyai/qa/_sampling.py +++ b/mostlyai/qa/_sampling.py @@ -54,7 +54,7 @@ def pull_data_for_accuracy( ctx_primary_key: str | None = None, tgt_context_key: str | None = None, max_sample_size: int | None = None, - setup: str | None = None, + setup: str, ) -> pd.DataFrame: """ Prepare single dataset for accuracy report. @@ -64,7 +64,7 @@ def pull_data_for_accuracy( assert df_ctx is None or (ctx_primary_key is not None and tgt_context_key is not None) assert tgt_context_key is None or tgt_context_key in df_tgt.columns assert ctx_primary_key is None or ctx_primary_key in df_ctx.columns - assert setup is None or setup in ["1:1", "1:N"] + assert setup in ["1:1", "1:N"] key = "__KEY" @@ -116,9 +116,6 @@ def pull_data_for_accuracy( df[count_column] = df[count_column].fillna(0).astype("Int64") df = df.loc[df[count_column] > 0].reset_index(drop=True) - if setup is None: - setup = "1:1" if (df[count_column] == 1).all() else "1:N" - # for 1:1 ctx/tgt setups, drop nxt and count columns; ensure at least one column remains if setup == "1:1": df = df.drop(columns=[c for c in df.columns if c.startswith(NXT_COLUMN_PREFIX)]) diff --git a/mostlyai/qa/reporting_from_statistics.py b/mostlyai/qa/reporting_from_statistics.py index 2883fbb..a03d6cc 100644 --- a/mostlyai/qa/reporting_from_statistics.py +++ b/mostlyai/qa/reporting_from_statistics.py @@ -113,6 +113,9 @@ def report_from_statistics( ctx_primary_key=ctx_primary_key, tgt_context_key=tgt_context_key, max_sample_size=max_sample_size_accuracy, + # always pull Sequence Length and nxt columns for synthetic data + # and let downstream functions decide if they are needed + setup="1:N", ) _LOG.info(f"sample synthetic data finished ({syn.shape=})") progress.update(completed=20, total=100) From c873d66a4aebdd0f724a56bc54c614d0bb6afb34 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Tue, 21 Jan 2025 16:55:27 +0100 Subject: [PATCH 2/3] wip --- mostlyai/qa/_sampling.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mostlyai/qa/_sampling.py b/mostlyai/qa/_sampling.py index 609af87..33f3447 100644 --- a/mostlyai/qa/_sampling.py +++ b/mostlyai/qa/_sampling.py @@ -54,7 +54,7 @@ def pull_data_for_accuracy( ctx_primary_key: str | None = None, tgt_context_key: str | None = None, max_sample_size: int | None = None, - setup: str, + setup: str | None = None, ) -> pd.DataFrame: """ Prepare single dataset for accuracy report. @@ -110,10 +110,14 @@ def pull_data_for_accuracy( df = pd.merge(df, df_tgt, on=key, how="left") df = pd.merge(df, df_nxt, on=key, how="left") df = df.drop(columns=[key]) - - # remove records with sequence length equal to 0 count_column = f"{TGT_COLUMN_PREFIX}{COUNT_COLUMN}" df[count_column] = df[count_column].fillna(0).astype("Int64") + + # determine setup if not provided + if setup is None: + setup = "1:1" if (df[count_column] == 1).all() else "1:N" + + # remove records with sequence length equal to 0 df = df.loc[df[count_column] > 0].reset_index(drop=True) # for 1:1 ctx/tgt setups, drop nxt and count columns; ensure at least one column remains From 8feee75b9b8aa45b645f46020c8621450ac143bb Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Tue, 21 Jan 2025 16:55:56 +0100 Subject: [PATCH 3/3] wip --- mostlyai/qa/_sampling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mostlyai/qa/_sampling.py b/mostlyai/qa/_sampling.py index 33f3447..ccd7389 100644 --- a/mostlyai/qa/_sampling.py +++ b/mostlyai/qa/_sampling.py @@ -64,7 +64,7 @@ def pull_data_for_accuracy( assert df_ctx is None or (ctx_primary_key is not None and tgt_context_key is not None) assert tgt_context_key is None or tgt_context_key in df_tgt.columns assert ctx_primary_key is None or ctx_primary_key in df_ctx.columns - assert setup in ["1:1", "1:N"] + assert setup is None or setup in ["1:1", "1:N"] key = "__KEY"