In [1]:
# Scratch cell: bind a sample string, then echo it so the notebook displays its value.
test = "test"
test 

'test'

In [None]:
def _analyze_and_report(self, lf: pl.LazyFrame, file_label: str):
    """Summarize NULL rows in a LazyFrame and return a lazily cleaned copy.

    The full dataset is never materialized. A single aggregate query yields
    both the number of rows containing at least one NULL and the total row
    count; column names and dtypes are read from the lazy schema.

    Args:
        lf: Lazy query to analyze.
        file_label: Prefix used in the log messages to identify the input.

    Returns:
        A ``(summary, cleaned)`` tuple:

        * ``summary`` is a dict with the keys ``"rows_with_missing"`` (int),
          ``"final_shape"`` (``[rows, columns]`` after dropping NULL rows)
          and ``"columns"`` (mapping of column name to dtype string).
        * ``cleaned`` is ``lf.drop_nulls()``, still lazy.
    """
    # One pass over the data: per-row "any column is NULL" flag summed into a
    # count, alongside the total number of rows.
    missing_expr = (
        pl.any_horizontal(pl.all().is_null()).sum().alias("rows_with_missing")
    )
    total_expr = pl.len().alias("n")
    # NOTE(review): newer Polars releases appear to deprecate `streaming=True`
    # in favour of `engine="streaming"` — confirm against the pinned version.
    counts = lf.select([missing_expr, total_expr]).collect(streaming=True)
    rows_with_missing = counts["rows_with_missing"][0]
    total_rows = counts["n"][0]
    logging.info(f"{file_label}: Rows with missing values: {rows_with_missing}")

    # Drop rows with any NULLs (still lazy).
    lf_clean = lf.drop_nulls()

    # drop_nulls() without a subset removes exactly the rows counted above, so
    # the remaining row count follows arithmetically — no second scan needed.
    rows_after_clean = total_rows - rows_with_missing

    # Resolve the schema once; it provides both the column count and dtypes.
    schema = lf_clean.schema
    col_count = len(schema)

    logging.info(f"{file_label}: Shape after dropping missing rows: ({rows_after_clean}, {col_count})")

    columns_info = {name: str(dtype) for name, dtype in schema.items()}

    summary = {
        "rows_with_missing": int(rows_with_missing),
        "final_shape": [int(rows_after_clean), int(col_count)],
        "columns": columns_info,
    }
    return summary, lf_clean