diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 273a40dd526cf..1e9c657cf81b3 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -802,11 +802,11 @@ def groupBy(self, *cols):
         Each element should be a column name (string) or an expression (:class:`Column`).
 
         >>> df.groupBy().avg().collect()
-        [Row(AVG(age)=3.5)]
+        [Row(avg(age)=3.5)]
         >>> df.groupBy('name').agg({'age': 'mean'}).collect()
-        [Row(name=u'Alice', AVG(age)=2.0), Row(name=u'Bob', AVG(age)=5.0)]
+        [Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)]
         >>> df.groupBy(df.name).avg().collect()
-        [Row(name=u'Alice', AVG(age)=2.0), Row(name=u'Bob', AVG(age)=5.0)]
+        [Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)]
         >>> df.groupBy(['name', df.age]).count().collect()
         [Row(name=u'Bob', age=5, count=1), Row(name=u'Alice', age=2, count=1)]
         """
@@ -864,10 +864,10 @@ def agg(self, *exprs):
         (shorthand for ``df.groupBy.agg()``).
 
         >>> df.agg({"age": "max"}).collect()
-        [Row(MAX(age)=5)]
+        [Row(max(age)=5)]
         >>> from pyspark.sql import functions as F
         >>> df.agg(F.min(df.age)).collect()
-        [Row(MIN(age)=2)]
+        [Row(min(age)=2)]
         """
         return self.groupBy().agg(*exprs)
 
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 4e2be88e9e3b9..f9a15d4a66309 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -266,7 +266,7 @@ def coalesce(*cols):
 
     >>> cDf.select(coalesce(cDf["a"], cDf["b"])).show()
     +-------------+
-    |Coalesce(a,b)|
+    |coalesce(a,b)|
     +-------------+
     |         null|
     |            1|
@@ -275,7 +275,7 @@ def coalesce(*cols):
 
     >>> cDf.select('*', coalesce(cDf["a"], lit(0.0))).show()
     +----+----+---------------+
-    |   a|   b|Coalesce(a,0.0)|
+    |   a|   b|coalesce(a,0.0)|
     +----+----+---------------+
     |null|null|            0.0|
     |   1|null|            1.0|
diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py
index 5a37a673ee80c..04594d5a836ce 100644
--- a/python/pyspark/sql/group.py
+++ b/python/pyspark/sql/group.py
@@ -75,11 +75,11 @@ def agg(self, *exprs):
 
         >>> gdf = df.groupBy(df.name)
         >>> gdf.agg({"*": "count"}).collect()
-        [Row(name=u'Alice', COUNT(1)=1), Row(name=u'Bob', COUNT(1)=1)]
+        [Row(name=u'Alice', count(1)=1), Row(name=u'Bob', count(1)=1)]
 
         >>> from pyspark.sql import functions as F
         >>> gdf.agg(F.min(df.age)).collect()
-        [Row(name=u'Alice', MIN(age)=2), Row(name=u'Bob', MIN(age)=5)]
+        [Row(name=u'Alice', min(age)=2), Row(name=u'Bob', min(age)=5)]
         """
         assert exprs, "exprs should not be empty"
         if len(exprs) == 1 and isinstance(exprs[0], dict):
@@ -110,9 +110,9 @@ def mean(self, *cols):
         :param cols: list of column names (string). Non-numeric columns are ignored.
 
         >>> df.groupBy().mean('age').collect()
-        [Row(AVG(age)=3.5)]
+        [Row(avg(age)=3.5)]
         >>> df3.groupBy().mean('age', 'height').collect()
-        [Row(AVG(age)=3.5, AVG(height)=82.5)]
+        [Row(avg(age)=3.5, avg(height)=82.5)]
         """
 
     @df_varargs_api
@@ -125,9 +125,9 @@ def avg(self, *cols):
         :param cols: list of column names (string). Non-numeric columns are ignored.
 
         >>> df.groupBy().avg('age').collect()
-        [Row(AVG(age)=3.5)]
+        [Row(avg(age)=3.5)]
         >>> df3.groupBy().avg('age', 'height').collect()
-        [Row(AVG(age)=3.5, AVG(height)=82.5)]
+        [Row(avg(age)=3.5, avg(height)=82.5)]
         """
 
     @df_varargs_api
@@ -136,9 +136,9 @@ def max(self, *cols):
         """Computes the max value for each numeric columns for each group.
 
         >>> df.groupBy().max('age').collect()
-        [Row(MAX(age)=5)]
+        [Row(max(age)=5)]
         >>> df3.groupBy().max('age', 'height').collect()
-        [Row(MAX(age)=5, MAX(height)=85)]
+        [Row(max(age)=5, max(height)=85)]
         """
 
     @df_varargs_api
@@ -149,9 +149,9 @@ def min(self, *cols):
         :param cols: list of column names (string). Non-numeric columns are ignored.
 
         >>> df.groupBy().min('age').collect()
-        [Row(MIN(age)=2)]
+        [Row(min(age)=2)]
         >>> df3.groupBy().min('age', 'height').collect()
-        [Row(MIN(age)=2, MIN(height)=80)]
+        [Row(min(age)=2, min(height)=80)]
         """
 
     @df_varargs_api
@@ -162,9 +162,9 @@ def sum(self, *cols):
         :param cols: list of column names (string). Non-numeric columns are ignored.
 
         >>> df.groupBy().sum('age').collect()
-        [Row(SUM(age)=7)]
+        [Row(sum(age)=7)]
         >>> df3.groupBy().sum('age', 'height').collect()
-        [Row(SUM(age)=7, SUM(height)=165)]
+        [Row(sum(age)=7, sum(height)=165)]
         """
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
index da520f56b430e..64e07bd2a17db 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
@@ -94,7 +94,6 @@ case class Min(child: Expression) extends PartialAggregate with trees.UnaryNode[
 
   override def nullable: Boolean = true
   override def dataType: DataType = child.dataType
-  override def toString: String = s"MIN($child)"
 
   override def asPartial: SplitEvaluation = {
     val partialMin = Alias(Min(child), "PartialMin")()
@@ -388,6 +387,8 @@ case class ApproxCountDistinct(child: Expression, relativeSD: Double = 0.05)
 
 case class Average(child: Expression) extends PartialAggregate with trees.UnaryNode[Expression] {
 
+  override def prettyName: String = "avg"
+
   override def nullable: Boolean = true
 
   override def dataType: DataType = child.dataType match {
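
For reviewers, a minimal sketch of the user-visible effect (not part of the patch): after this change, generated aggregate columns are named with lowercase function names, so callers that referenced the old uppercase names must switch from e.g. "AVG(age)" to "avg(age)". The setup below is an assumption mirroring the doctest fixtures of this patch (a Spark 1.x SQLContext and the (Alice, 2) / (Bob, 5) sample rows); it is illustrative, not part of the change itself.

# Hypothetical driver script, assuming a local Spark 1.x install.
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row

sc = SparkContext(appName="lowercase-agg-names")
sqlContext = SQLContext(sc)

# Same sample data as the doctests in this patch.
df = sqlContext.createDataFrame([Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)])

agg_df = df.groupBy().avg()
print(agg_df.columns)    # ['avg(age)'] -- lowercase after this patch, previously ['AVG(age)']
print(agg_df.collect())  # [Row(avg(age)=3.5)]

# Code that selects the generated column must now use the lowercase name.
agg_df.select(agg_df['avg(age)']).show()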