From 3913730c71c84e10ba8d58c4cced2b92cb558da4 Mon Sep 17 00:00:00 2001
From: Lukasz Kolodziejczyk <lukasz.kolodziejczyk@mostly.ai>
Date: Tue, 21 Jan 2025 16:50:11 +0100
Subject: [PATCH 1/3] wip: require explicit setup in pull_data_for_accuracy; pass setup="1:N" for synthetic data

---
 mostlyai/qa/_sampling.py                 | 7 ++-----
 mostlyai/qa/reporting_from_statistics.py | 3 +++
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/mostlyai/qa/_sampling.py b/mostlyai/qa/_sampling.py
index 4665a5a..609af87 100644
--- a/mostlyai/qa/_sampling.py
+++ b/mostlyai/qa/_sampling.py
@@ -54,7 +54,7 @@ def pull_data_for_accuracy(
     ctx_primary_key: str | None = None,
     tgt_context_key: str | None = None,
     max_sample_size: int | None = None,
-    setup: str | None = None,
+    setup: str,
 ) -> pd.DataFrame:
     """
     Prepare single dataset for accuracy report.
@@ -64,7 +64,7 @@ def pull_data_for_accuracy(
     assert df_ctx is None or (ctx_primary_key is not None and tgt_context_key is not None)
     assert tgt_context_key is None or tgt_context_key in df_tgt.columns
     assert ctx_primary_key is None or ctx_primary_key in df_ctx.columns
-    assert setup is None or setup in ["1:1", "1:N"]
+    assert setup in ["1:1", "1:N"]
 
     key = "__KEY"
 
@@ -116,9 +116,6 @@ def pull_data_for_accuracy(
     df[count_column] = df[count_column].fillna(0).astype("Int64")
     df = df.loc[df[count_column] > 0].reset_index(drop=True)
 
-    if setup is None:
-        setup = "1:1" if (df[count_column] == 1).all() else "1:N"
-
     # for 1:1 ctx/tgt setups, drop nxt and count columns; ensure at least one column remains
     if setup == "1:1":
         df = df.drop(columns=[c for c in df.columns if c.startswith(NXT_COLUMN_PREFIX)])
diff --git a/mostlyai/qa/reporting_from_statistics.py b/mostlyai/qa/reporting_from_statistics.py
index 2883fbb..a03d6cc 100644
--- a/mostlyai/qa/reporting_from_statistics.py
+++ b/mostlyai/qa/reporting_from_statistics.py
@@ -113,6 +113,9 @@ def report_from_statistics(
             ctx_primary_key=ctx_primary_key,
             tgt_context_key=tgt_context_key,
             max_sample_size=max_sample_size_accuracy,
+            # always pull Sequence Length and nxt columns for synthetic data
+            # and let downstream functions decide if they are needed
+            setup="1:N",
         )
         _LOG.info(f"sample synthetic data finished ({syn.shape=})")
         progress.update(completed=20, total=100)

From c873d66a4aebdd0f724a56bc54c614d0bb6afb34 Mon Sep 17 00:00:00 2001
From: Lukasz Kolodziejczyk <lukasz.kolodziejczyk@mostly.ai>
Date: Tue, 21 Jan 2025 16:55:27 +0100
Subject: [PATCH 2/3] wip: make setup optional again; infer it before dropping zero-length records

---
 mostlyai/qa/_sampling.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/mostlyai/qa/_sampling.py b/mostlyai/qa/_sampling.py
index 609af87..33f3447 100644
--- a/mostlyai/qa/_sampling.py
+++ b/mostlyai/qa/_sampling.py
@@ -54,7 +54,7 @@ def pull_data_for_accuracy(
     ctx_primary_key: str | None = None,
     tgt_context_key: str | None = None,
     max_sample_size: int | None = None,
-    setup: str,
+    setup: str | None = None,
 ) -> pd.DataFrame:
     """
     Prepare single dataset for accuracy report.
@@ -110,10 +110,14 @@ def pull_data_for_accuracy(
     df = pd.merge(df, df_tgt, on=key, how="left")
     df = pd.merge(df, df_nxt, on=key, how="left")
     df = df.drop(columns=[key])
-
-    # remove records with sequence length equal to 0
     count_column = f"{TGT_COLUMN_PREFIX}{COUNT_COLUMN}"
     df[count_column] = df[count_column].fillna(0).astype("Int64")
+
+    # infer setup if not provided; any record whose count is not 1 (including 0) implies 1:N
+    if setup is None:
+        setup = "1:1" if (df[count_column] == 1).all() else "1:N"
+
+    # remove records with sequence length equal to 0
     df = df.loc[df[count_column] > 0].reset_index(drop=True)
 
     # for 1:1 ctx/tgt setups, drop nxt and count columns; ensure at least one column remains

From 8feee75b9b8aa45b645f46020c8621450ac143bb Mon Sep 17 00:00:00 2001
From: Lukasz Kolodziejczyk <lukasz.kolodziejczyk@mostly.ai>
Date: Tue, 21 Jan 2025 16:55:56 +0100
Subject: [PATCH 3/3] wip: allow setup=None in the pull_data_for_accuracy assertion

---
 mostlyai/qa/_sampling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mostlyai/qa/_sampling.py b/mostlyai/qa/_sampling.py
index 33f3447..ccd7389 100644
--- a/mostlyai/qa/_sampling.py
+++ b/mostlyai/qa/_sampling.py
@@ -64,7 +64,7 @@ def pull_data_for_accuracy(
     assert df_ctx is None or (ctx_primary_key is not None and tgt_context_key is not None)
     assert tgt_context_key is None or tgt_context_key in df_tgt.columns
     assert ctx_primary_key is None or ctx_primary_key in df_ctx.columns
-    assert setup in ["1:1", "1:N"]
+    assert setup is None or setup in ["1:1", "1:N"]
 
     key = "__KEY"