Skip to content

Commit

Permalink
feat(python): Update describe to use new count implementation (po…
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored and mcrumiller committed Dec 12, 2023
1 parent a6483c6 commit 2cc4083
Show file tree
Hide file tree
Showing 9 changed files with 425 additions and 239 deletions.
54 changes: 36 additions & 18 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
from polars.datatypes import (
INTEGER_DTYPES,
N_INFER_DEFAULT,
NUMERIC_DTYPES,
Boolean,
Float64,
Object,
Expand Down Expand Up @@ -4340,7 +4339,7 @@ def describe(
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │
╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡
│ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 33 ┆ 3 │
│ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 22 ┆ 3 │
│ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │
│ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │
│ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │
Expand All @@ -4352,44 +4351,63 @@ def describe(
└────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘
"""
# determine metrics and optional/additional percentiles
if not self.columns:
raise TypeError("cannot describe a DataFrame without any columns")

# Determine which columns should get std/mean/percentile statistics
stat_cols = {
c for c, dt in self.schema.items() if dt.is_numeric() or dt == Boolean
}

# Determine metrics and optional/additional percentiles
metrics = ["count", "null_count", "mean", "std", "min"]
percentile_exprs = []
for p in parse_percentiles(percentiles):
percentile_exprs.append(F.all().quantile(p).name.prefix(f"{p}:"))
for c in self.columns:
expr = F.col(c).quantile(p) if c in stat_cols else F.lit(None)
expr = expr.alias(f"{p}:{c}")
percentile_exprs.append(expr)
metrics.append(f"{p:.0%}")
metrics.append("max")

# execute metrics in parallel
mean_exprs = [
(F.col(c).mean() if c in stat_cols else F.lit(None)).alias(f"mean:{c}")
for c in self.columns
]
std_exprs = [
(F.col(c).std() if c in stat_cols else F.lit(None)).alias(f"std:{c}")
for c in self.columns
]

# Calculate metrics in parallel
df_metrics = self.select(
F.all().len().name.prefix("count:"),
F.all().count().name.prefix("count:"),
F.all().null_count().name.prefix("null_count:"),
F.all().mean().name.prefix("mean:"),
F.all().std().name.prefix("std:"),
*mean_exprs,
*std_exprs,
F.all().min().name.prefix("min:"),
*percentile_exprs,
F.all().max().name.prefix("max:"),
).row(0)
)

# reshape wide result
n_cols = len(self.columns)
# Reshape wide result
described = [
df_metrics[(n * n_cols) : (n + 1) * n_cols] for n in range(len(metrics))
df_metrics.row(0)[(n * self.width) : (n + 1) * self.width]
for n in range(len(metrics))
]

# cast by column type (numeric/bool -> float), (other -> string)
# Cast by column type (numeric/bool -> float), (other -> string)
summary = dict(zip(self.columns, list(zip(*described))))
num_or_bool = NUMERIC_DTYPES | {Boolean}
for c, tp in self.schema.items():
for c in self.columns:
summary[c] = [ # type: ignore[assignment]
None
if (v is None or isinstance(v, dict))
else (float(v) if tp in num_or_bool else str(v))
else (float(v) if c in stat_cols else str(v))
for v in summary[c]
]

# return results as a frame
df_summary = self.__class__(summary)
# Return results as a DataFrame
df_summary = self._from_dict(summary)
df_summary.insert_column(0, pl.Series("describe", metrics))
return df_summary

Expand Down
97 changes: 51 additions & 46 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1576,8 +1576,8 @@ def describe(
Examples
--------
>>> series_num = pl.Series([1, 2, 3, 4, 5])
>>> series_num.describe()
>>> s = pl.Series([1, 2, 3, 4, 5])
>>> s.describe()
shape: (9, 2)
┌────────────┬──────────┐
│ statistic ┆ value │
Expand All @@ -1595,64 +1595,70 @@ def describe(
│ max ┆ 5.0 │
└────────────┴──────────┘
>>> series_str = pl.Series(["a", "a", None, "b", "c"])
>>> series_str.describe()
Non-numeric data types may not have all statistics available.
>>> s = pl.Series(["a", "a", None, "b", "c"])
>>> s.describe()
shape: (3, 2)
┌────────────┬───────┐
│ statistic ┆ value │
│ --- ┆ --- │
│ str ┆ i64 │
╞════════════╪═══════╡
│ count ┆ 5
│ count ┆ 4
│ null_count ┆ 1 │
│ unique ┆ 4 │
└────────────┴───────┘
"""
stats: dict[str, PythonLiteral | None]
stats_dtype: PolarsDataType

if self.len() == 0:
raise ValueError("Series must contain at least one value")

elif self.dtype.is_numeric():
s = self.cast(Float64)
if self.dtype.is_numeric():
stats_dtype = Float64
stats = {
"count": s.len(),
"null_count": s.null_count(),
"mean": s.mean(),
"std": s.std(),
"min": s.min(),
"count": self.count(),
"null_count": self.null_count(),
"mean": self.mean(),
"std": self.std(),
"min": self.min(),
}
for p in parse_percentiles(percentiles):
stats[f"{p:.0%}"] = s.quantile(p)
stats["max"] = s.max()
stats[f"{p:.0%}"] = self.quantile(p)
stats["max"] = self.max()

elif self.dtype == Boolean:
stats_dtype = Int64
stats = {
"count": self.len(),
"count": self.count(),
"null_count": self.null_count(),
"sum": self.sum(),
}
elif self.dtype == Utf8:
stats_dtype = Int64
stats = {
"count": self.len(),
"count": self.count(),
"null_count": self.null_count(),
"unique": len(self.unique()),
"unique": self.n_unique(),
}
elif self.dtype.is_temporal():
# we coerce all to string, because a polars column
# only has a single dtype and dates: datetime and count: int don't match
stats_dtype = Utf8
stats = {
"count": str(self.len()),
"count": str(self.count()),
"null_count": str(self.null_count()),
"min": str(self.dt.min()),
"50%": str(self.dt.median()),
"max": str(self.dt.max()),
}
else:
raise TypeError("this type is not supported")
raise TypeError(f"cannot describe Series of data type {self.dtype}")

return pl.DataFrame({"statistic": stats.keys(), "value": stats.values()})
return pl.DataFrame(
{"statistic": stats.keys(), "value": stats.values()},
schema={"statistic": Utf8, "value": stats_dtype},
)

def sum(self) -> int | float:
"""
Expand Down Expand Up @@ -1687,7 +1693,7 @@ def mean(self) -> int | float | None:

def product(self) -> int | float:
"""Reduce this Series to the product value."""
return self.to_frame().select(F.col(self.name).product()).to_series().item()
return self._s.product()

def pow(self, exponent: int | float | None | Series) -> Series:
"""
Expand Down Expand Up @@ -1754,7 +1760,8 @@ def nan_max(self) -> int | float | date | datetime | timedelta | str:
whereas polars defaults to ignoring them.
"""
return self.to_frame().select(F.col(self.name).nan_max()).item()
# return self.to_frame().select(F.col(self.name).nan_max()).item()
return self._s.nan_max()

def nan_min(self) -> int | float | date | datetime | timedelta | str:
"""
Expand Down Expand Up @@ -1786,7 +1793,8 @@ def std(self, ddof: int = 1) -> float | None:
"""
if not self.dtype.is_numeric():
return None
return self.to_frame().select(F.col(self.name).std(ddof)).to_series().item()
# return self.to_frame().select(F.col(self.name).std(ddof)).to_series().item()
return self._s.std(ddof)

def var(self, ddof: int = 1) -> float | None:
"""
Expand All @@ -1808,7 +1816,7 @@ def var(self, ddof: int = 1) -> float | None:
"""
if not self.dtype.is_numeric():
return None
return self.to_frame().select(F.col(self.name).var(ddof)).to_series().item()
return self._s.var(ddof)

def median(self) -> float | None:
"""
Expand Down Expand Up @@ -2447,10 +2455,8 @@ def value_counts(self, *, sort: bool = False, parallel: bool = False) -> DataFra
│ green ┆ 1 │
└───────┴───────┘
"""
return (
self.to_frame()
.select(F.col(self.name).value_counts(sort=sort, parallel=parallel))
.unnest(self.name)
return pl.DataFrame._from_pydf(
self._s.value_counts(sort=sort, parallel=parallel)
)

def unique_counts(self) -> Series:
Expand Down Expand Up @@ -2494,12 +2500,7 @@ def entropy(self, base: float = math.e, *, normalize: bool = False) -> float | N
0.8568409950394724
"""
return (
self.to_frame()
.select(F.col(self.name).entropy(base, normalize=normalize))
.to_series()
.item()
)
return self._s.entropy(base=base, normalize=normalize)

def cumulative_eval(
self, expr: Expr, min_periods: int = 1, *, parallel: bool = False
Expand Down Expand Up @@ -2766,6 +2767,7 @@ def slice(self, offset: int, length: int | None = None) -> Series:
]
"""
return self._from_pyseries(self._s.slice(offset=offset, length=length))

def append(self, other: Series) -> Self:
"""
Expand Down Expand Up @@ -3934,16 +3936,19 @@ def is_between(
]
"""
if isinstance(lower_bound, str):
lower_bound = F.lit(lower_bound)
if isinstance(upper_bound, str):
upper_bound = F.lit(upper_bound)
if closed == "none":
out = (self > lower_bound) & (self < upper_bound)
elif closed == "both":
out = (self >= lower_bound) & (self <= upper_bound)
elif closed == "right":
out = (self > lower_bound) & (self <= upper_bound)
elif closed == "left":
out = (self >= lower_bound) & (self < upper_bound)

return (
self.to_frame()
.select(F.col(self.name).is_between(lower_bound, upper_bound, closed))
.to_series()
)
if isinstance(out, pl.Expr):
out = F.select(out).to_series()

return out

def to_numpy(
self,
Expand Down
67 changes: 67 additions & 0 deletions py-polars/src/series/aggregation.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use polars_ops::chunked_array::nan_propagating_aggregate::*;
use pyo3::prelude::*;

use crate::error::PyPolarsErr;
Expand All @@ -6,6 +7,20 @@ use crate::PySeries;

#[pymethods]
impl PySeries {
fn any(&self) -> Option<bool> {
match self.series.dtype() {
DataType::Boolean => Some(self.series.bool().unwrap().any()),
_ => None,
}
}

fn all(&self) -> Option<bool> {
match self.series.dtype() {
DataType::Boolean => Some(self.series.bool().unwrap().all()),
_ => None,
}
}

    /// Index of the maximum value. Delegates directly to `Series::arg_max`
    /// on the wrapped series; `None` presumably indicates an empty or
    /// all-null series — confirm against the polars-core implementation.
    fn arg_max(&self) -> Option<usize> {
        self.series.arg_max()
    }
Expand Down Expand Up @@ -54,6 +69,34 @@ impl PySeries {
.into_py(py))
}

fn nan_min(&self, py: Python) -> PyResult<PyObject> {
match self.series.dtype() {
DataType::Float32 | DataType::Float64 => Ok(Wrap(
nan_min_s(&self.series, self.name())
.get(0)
.map_err(PyPolarsErr::from)?,
)
.into_py(py)),
_ => self.min(py),
}
}

fn nan_max(&self, py: Python) -> PyResult<PyObject> {
match self.series.dtype() {
DataType::Float32 | DataType::Float64 => Ok(Wrap(
nan_max_s(&self.series, self.name())
.get(0)
.map_err(PyPolarsErr::from)?,
)
.into_py(py)),
_ => self.max(py),
}
}

fn product(&self, py: Python) -> PyResult<PyObject> {
Ok(Wrap(self.series.product().get(0).map_err(PyPolarsErr::from)?).into_py(py))
}

fn quantile(&self, quantile: f64, interpolation: Wrap<QuantileInterpolOptions>) -> PyObject {
Python::with_gil(|py| {
Wrap(
Expand All @@ -67,6 +110,26 @@ impl PySeries {
})
}

fn std(&self, py: Python, ddof: u8) -> PyResult<PyObject> {
Ok(Wrap(
self.series
.std_as_series(ddof)
.get(0)
.map_err(PyPolarsErr::from)?,
)
.into_py(py))
}

fn var(&self, py: Python, ddof: u8) -> PyResult<PyObject> {
Ok(Wrap(
self.series
.var_as_series(ddof)
.get(0)
.map_err(PyPolarsErr::from)?,
)
.into_py(py))
}

fn sum(&self, py: Python) -> PyResult<PyObject> {
Ok(Wrap(
self.series
Expand All @@ -76,4 +139,8 @@ impl PySeries {
)
.into_py(py))
}

    /// Entropy of the Series in logarithm base `base`. Delegates directly
    /// to `Series::entropy`; `normalize` presumably rescales the values to
    /// a probability distribution first — confirm against polars-ops docs.
    fn entropy(&self, base: f64, normalize: bool) -> Option<f64> {
        self.series.entropy(base, normalize)
    }
}
Loading

0 comments on commit 2cc4083

Please sign in to comment.