Skip to content

Commit

Permalink
feat(python): Update describe to use new count implementation (po…
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored and mcrumiller committed Dec 12, 2023
1 parent a6483c6 commit 2cc4083
Show file tree
Hide file tree
Showing 9 changed files with 425 additions and 239 deletions.
54 changes: 36 additions & 18 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
from polars.datatypes import (
INTEGER_DTYPES,
N_INFER_DEFAULT,
NUMERIC_DTYPES,
Boolean,
Float64,
Object,
Expand Down Expand Up @@ -4340,7 +4339,7 @@ def describe(
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │
╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡
│ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 33 ┆ 3 │
│ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 22 ┆ 3 │
│ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │
│ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │
│ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │
Expand All @@ -4352,44 +4351,63 @@ def describe(
└────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘
"""
# determine metrics and optional/additional percentiles
if not self.columns:
raise TypeError("cannot describe a DataFrame without any columns")

# Determine which columns should get std/mean/percentile statistics
stat_cols = {
c for c, dt in self.schema.items() if dt.is_numeric() or dt == Boolean
}

# Determine metrics and optional/additional percentiles
metrics = ["count", "null_count", "mean", "std", "min"]
percentile_exprs = []
for p in parse_percentiles(percentiles):
percentile_exprs.append(F.all().quantile(p).name.prefix(f"{p}:"))
for c in self.columns:
expr = F.col(c).quantile(p) if c in stat_cols else F.lit(None)
expr = expr.alias(f"{p}:{c}")
percentile_exprs.append(expr)
metrics.append(f"{p:.0%}")
metrics.append("max")

# execute metrics in parallel
mean_exprs = [
(F.col(c).mean() if c in stat_cols else F.lit(None)).alias(f"mean:{c}")
for c in self.columns
]
std_exprs = [
(F.col(c).std() if c in stat_cols else F.lit(None)).alias(f"std:{c}")
for c in self.columns
]

# Calculate metrics in parallel
df_metrics = self.select(
F.all().len().name.prefix("count:"),
F.all().count().name.prefix("count:"),
F.all().null_count().name.prefix("null_count:"),
F.all().mean().name.prefix("mean:"),
F.all().std().name.prefix("std:"),
*mean_exprs,
*std_exprs,
F.all().min().name.prefix("min:"),
*percentile_exprs,
F.all().max().name.prefix("max:"),
).row(0)
)

# reshape wide result
n_cols = len(self.columns)
# Reshape wide result
described = [
df_metrics[(n * n_cols) : (n + 1) * n_cols] for n in range(len(metrics))
df_metrics.row(0)[(n * self.width) : (n + 1) * self.width]
for n in range(len(metrics))
]

# cast by column type (numeric/bool -> float), (other -> string)
# Cast by column type (numeric/bool -> float), (other -> string)
summary = dict(zip(self.columns, list(zip(*described))))
num_or_bool = NUMERIC_DTYPES | {Boolean}
for c, tp in self.schema.items():
for c in self.columns:
summary[c] = [ # type: ignore[assignment]
None
if (v is None or isinstance(v, dict))
else (float(v) if tp in num_or_bool else str(v))
else (float(v) if c in stat_cols else str(v))
for v in summary[c]
]

# return results as a frame
df_summary = self.__class__(summary)
# Return results as a DataFrame
df_summary = self._from_dict(summary)
df_summary.insert_column(0, pl.Series("describe", metrics))
return df_summary

Expand Down
97 changes: 51 additions & 46 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1576,8 +1576,8 @@ def describe(
Examples
--------
>>> series_num = pl.Series([1, 2, 3, 4, 5])
>>> series_num.describe()
>>> s = pl.Series([1, 2, 3, 4, 5])
>>> s.describe()
shape: (9, 2)
┌────────────┬──────────┐
│ statistic ┆ value │
Expand All @@ -1595,64 +1595,70 @@ def describe(
│ max ┆ 5.0 │
└────────────┴──────────┘
>>> series_str = pl.Series(["a", "a", None, "b", "c"])
>>> series_str.describe()
Non-numeric data types may not have all statistics available.
>>> s = pl.Series(["a", "a", None, "b", "c"])
>>> s.describe()
shape: (3, 2)
┌────────────┬───────┐
│ statistic ┆ value │
│ --- ┆ --- │
│ str ┆ i64 │
╞════════════╪═══════╡
│ count ┆ 5
│ count ┆ 4
│ null_count ┆ 1 │
│ unique ┆ 4 │
└────────────┴───────┘
"""
stats: dict[str, PythonLiteral | None]
stats_dtype: PolarsDataType

if self.len() == 0:
raise ValueError("Series must contain at least one value")

elif self.dtype.is_numeric():
s = self.cast(Float64)
if self.dtype.is_numeric():
stats_dtype = Float64
stats = {
"count": s.len(),
"null_count": s.null_count(),
"mean": s.mean(),
"std": s.std(),
"min": s.min(),
"count": self.count(),
"null_count": self.null_count(),
"mean": self.mean(),
"std": self.std(),
"min": self.min(),
}
for p in parse_percentiles(percentiles):
stats[f"{p:.0%}"] = s.quantile(p)
stats["max"] = s.max()
stats[f"{p:.0%}"] = self.quantile(p)
stats["max"] = self.max()

elif self.dtype == Boolean:
stats_dtype = Int64
stats = {
"count": self.len(),
"count": self.count(),
"null_count": self.null_count(),
"sum": self.sum(),
}
elif self.dtype == Utf8:
stats_dtype = Int64
stats = {
"count": self.len(),
"count": self.count(),
"null_count": self.null_count(),
"unique": len(self.unique()),
"unique": self.n_unique(),
}
elif self.dtype.is_temporal():
# we coerce all to string, because a polars column
# only has a single dtype and dates: datetime and count: int don't match
stats_dtype = Utf8
stats = {
"count": str(self.len()),
"count": str(self.count()),
"null_count": str(self.null_count()),
"min": str(self.dt.min()),
"50%": str(self.dt.median()),
"max": str(self.dt.max()),
}
else:
raise TypeError("this type is not supported")
raise TypeError(f"cannot describe Series of data type {self.dtype}")

return pl.DataFrame({"statistic": stats.keys(), "value": stats.values()})
return pl.DataFrame(
{"statistic": stats.keys(), "value": stats.values()},
schema={"statistic": Utf8, "value": stats_dtype},
)

def sum(self) -> int | float:
"""
Expand Down Expand Up @@ -1687,7 +1693,7 @@ def mean(self) -> int | float | None:

def product(self) -> int | float:
"""Reduce this Series to the product value."""
return self.to_frame().select(F.col(self.name).product()).to_series().item()
return self._s.product()

def pow(self, exponent: int | float | None | Series) -> Series:
"""
Expand Down Expand Up @@ -1754,7 +1760,8 @@ def nan_max(self) -> int | float | date | datetime | timedelta | str:
whereas polars defaults to ignoring them.
"""
return self.to_frame().select(F.col(self.name).nan_max()).item()
# return self.to_frame().select(F.col(self.name).nan_max()).item()
return self._s.nan_max()

def nan_min(self) -> int | float | date | datetime | timedelta | str:
"""
Expand Down Expand Up @@ -1786,7 +1793,8 @@ def std(self, ddof: int = 1) -> float | None:
"""
if not self.dtype.is_numeric():
return None
return self.to_frame().select(F.col(self.name).std(ddof)).to_series().item()
# return self.to_frame().select(F.col(self.name).std(ddof)).to_series().item()
return self._s.std(ddof)

def var(self, ddof: int = 1) -> float | None:
"""
Expand All @@ -1808,7 +1816,7 @@ def var(self, ddof: int = 1) -> float | None:
"""
if not self.dtype.is_numeric():
return None
return self.to_frame().select(F.col(self.name).var(ddof)).to_series().item()
return self._s.var(ddof)

def median(self) -> float | None:
"""
Expand Down Expand Up @@ -2447,10 +2455,8 @@ def value_counts(self, *, sort: bool = False, parallel: bool = False) -> DataFra
│ green ┆ 1 │
└───────┴───────┘
"""
return (
self.to_frame()
.select(F.col(self.name).value_counts(sort=sort, parallel=parallel))
.unnest(self.name)
return pl.DataFrame._from_pydf(
self._s.value_counts(sort=sort, parallel=parallel)
)

def unique_counts(self) -> Series:
Expand Down Expand Up @@ -2494,12 +2500,7 @@ def entropy(self, base: float = math.e, *, normalize: bool = False) -> float | N
0.8568409950394724
"""
return (
self.to_frame()
.select(F.col(self.name).entropy(base, normalize=normalize))
.to_series()
.item()
)
return self._s.entropy(base=base, normalize=normalize)

def cumulative_eval(
self, expr: Expr, min_periods: int = 1, *, parallel: bool = False
Expand Down Expand Up @@ -2766,6 +2767,7 @@ def slice(self, offset: int, length: int | None = None) -> Series:
]
"""
return self._from_pyseries(self._s.slice(offset=offset, length=length))

def append(self, other: Series) -> Self:
"""
Expand Down Expand Up @@ -3934,16 +3936,19 @@ def is_between(
]
"""
if isinstance(lower_bound, str):
lower_bound = F.lit(lower_bound)
if isinstance(upper_bound, str):
upper_bound = F.lit(upper_bound)
if closed == "none":
out = (self > lower_bound) & (self < upper_bound)
elif closed == "both":
out = (self >= lower_bound) & (self <= upper_bound)
elif closed == "right":
out = (self > lower_bound) & (self <= upper_bound)
elif closed == "left":
out = (self >= lower_bound) & (self < upper_bound)

return (
self.to_frame()
.select(F.col(self.name).is_between(lower_bound, upper_bound, closed))
.to_series()
)
if isinstance(out, pl.Expr):
out = F.select(out).to_series()

return out

def to_numpy(
self,
Expand Down
67 changes: 67 additions & 0 deletions py-polars/src/series/aggregation.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use polars_ops::chunked_array::nan_propagating_aggregate::*;
use pyo3::prelude::*;

use crate::error::PyPolarsErr;
Expand All @@ -6,6 +7,20 @@ use crate::PySeries;

#[pymethods]
impl PySeries {
fn any(&self) -> Option<bool> {
match self.series.dtype() {
DataType::Boolean => Some(self.series.bool().unwrap().any()),
_ => None,
}
}

fn all(&self) -> Option<bool> {
match self.series.dtype() {
DataType::Boolean => Some(self.series.bool().unwrap().all()),
_ => None,
}
}

    /// Index of the maximum value. Delegates directly to `Series::arg_max`
    /// on the wrapped series; `None` presumably indicates an empty or
    /// all-null series — confirm against the polars-core implementation.
    fn arg_max(&self) -> Option<usize> {
        self.series.arg_max()
    }
Expand Down Expand Up @@ -54,6 +69,34 @@ impl PySeries {
.into_py(py))
}

fn nan_min(&self, py: Python) -> PyResult<PyObject> {
match self.series.dtype() {
DataType::Float32 | DataType::Float64 => Ok(Wrap(
nan_min_s(&self.series, self.name())
.get(0)
.map_err(PyPolarsErr::from)?,
)
.into_py(py)),
_ => self.min(py),
}
}

fn nan_max(&self, py: Python) -> PyResult<PyObject> {
match self.series.dtype() {
DataType::Float32 | DataType::Float64 => Ok(Wrap(
nan_max_s(&self.series, self.name())
.get(0)
.map_err(PyPolarsErr::from)?,
)
.into_py(py)),
_ => self.max(py),
}
}

fn product(&self, py: Python) -> PyResult<PyObject> {
Ok(Wrap(self.series.product().get(0).map_err(PyPolarsErr::from)?).into_py(py))
}

fn quantile(&self, quantile: f64, interpolation: Wrap<QuantileInterpolOptions>) -> PyObject {
Python::with_gil(|py| {
Wrap(
Expand All @@ -67,6 +110,26 @@ impl PySeries {
})
}

fn std(&self, py: Python, ddof: u8) -> PyResult<PyObject> {
Ok(Wrap(
self.series
.std_as_series(ddof)
.get(0)
.map_err(PyPolarsErr::from)?,
)
.into_py(py))
}

fn var(&self, py: Python, ddof: u8) -> PyResult<PyObject> {
Ok(Wrap(
self.series
.var_as_series(ddof)
.get(0)
.map_err(PyPolarsErr::from)?,
)
.into_py(py))
}

fn sum(&self, py: Python) -> PyResult<PyObject> {
Ok(Wrap(
self.series
Expand All @@ -76,4 +139,8 @@ impl PySeries {
)
.into_py(py))
}

    /// Entropy of the Series in logarithm base `base`. Delegates directly
    /// to `Series::entropy`; `normalize` presumably rescales the values to
    /// a probability distribution first — confirm against polars-ops docs.
    fn entropy(&self, base: f64, normalize: bool) -> Option<f64> {
        self.series.entropy(base, normalize)
    }
}
Loading

0 comments on commit 2cc4083

Please sign in to comment.