In [0]:
# Sample employee records, one tuple per row: (Id, Name, Gender, Salary, Dept).
# NOTE(review): Id is 1 for five of the six rows — presumably placeholder data
# rather than a unique key; confirm before relying on it as an identifier.
data = [
    (1, 'Maheer', 'M', 5000, 'IT'),
    (2, 'Ayesha', 'F', 5000, 'HR'),
    (1, 'Mahesh', 'M', 5000, 'CS'),
    (1, 'Govind', 'M', 5000, 'HR'),
    (1, 'Rita', 'F', 5000, 'CS'),
    (1, 'Asi', 'M', 5000, 'CS'),
]

# Column names, in the same order as the fields of each tuple above.
schema = ('Id', 'Name', 'Gender', 'Salary', 'Dept')

# Build the DataFrame used by every later cell and print its contents.
df = spark.createDataFrame(data, schema)
df.show()

+---+------+------+------+----+
| Id|  Name|Gender|Salary|Dept|
+---+------+------+------+----+
|  1|Maheer|     M|  5000|  IT|
|  2|Ayesha|     F|  5000|  HR|
|  1|Mahesh|     M|  5000|  CS|
|  1|Govind|     M|  5000|  HR|
|  1|  Rita|     F|  5000|  CS|
|  1|   Asi|     M|  5000|  CS|
+---+------+------+------+----+



In [0]:
# Count the number of rows (employees) in each department.
df.groupBy('Dept').count().show()

+----+-----+
|Dept|count|
+----+-----+
|  IT|    1|
|  HR|    2|
|  CS|    3|
+----+-----+



In [0]:
# groupBy() on its own returns a GroupedData object, not a DataFrame;
# an aggregation (count, min, agg, ...) must follow to get a DataFrame back.
type(df.groupBy('Dept'))

Out[6]: pyspark.sql.group.GroupedData

In [0]:
# Minimum salary per department. The grouping column is passed here as a
# Column object (df.Dept) instead of a string name.
df.groupBy(df.Dept).min('Salary').show()

+----+-----------+
|Dept|min(Salary)|
+----+-----------+
|  IT|       5000|
|  HR|       5000|
|  CS|       5000|
+----+-----------+



In [0]:
# When no column name is given, min() is computed for every numeric column
# of the DataFrame (here Id and Salary), not just one.
df.groupBy(df.Dept).min().show()

+----+-------+-----------+
|Dept|min(Id)|min(Salary)|
+----+-------+-----------+
|  IT|      1|       5000|
|  HR|      1|       5000|
|  CS|      1|       5000|
+----+-------+-----------+



In [0]:
# Group by more than one column: one output row per distinct (Dept, Gender) pair.
df.groupBy('Dept', 'Gender').count().show()

+----+------+-----+
|Dept|Gender|count|
+----+------+-----+
|  IT|     M|    1|
|  HR|     F|    1|
|  CS|     M|    2|
|  HR|     M|    1|
|  CS|     F|    1|
+----+------+-----+



In [0]:
# Show the documentation of GroupedData.agg(), which applies one or more
# aggregate expressions to the groups produced by groupBy().
# The column is referenced as 'Dept', matching the schema's exact casing, so the
# lookup does not rely on Spark's case-insensitive column resolution.
help(df.groupBy('Dept').agg)

Help on method agg in module pyspark.sql.group:

agg(*exprs: Union[pyspark.sql.column.Column, Dict[str, str]]) -> pyspark.sql.dataframe.DataFrame method of pyspark.sql.group.GroupedData instance
    Compute aggregates and returns the result as a :class:`DataFrame`.
    
    The available aggregate functions can be:
    
    1. built-in aggregation functions, such as `avg`, `max`, `min`, `sum`, `count`
    
    2. group aggregate pandas UDFs, created with :func:`pyspark.sql.functions.pandas_udf`
    
       .. note:: There is no partial aggregation with group aggregate UDFs, i.e.,
           a full shuffle is required. Also, all the data of a group will be loaded into
           memory, so the user should be aware of the potential OOM risk if data is skewed
           and certain groups are too large to fit in memory.
    
       .. seealso:: :func:`pyspark.sql.functions.pandas_udf`
    
    If ``exprs`` is a single :class:`dict` mapping from string to string, then the key
    is the column to perform aggregation on, and the value is the aggregate function.

In [0]:
# We can apply multiple aggregation functions at a time using agg()

# NOTE: this import rebinds the names `min` and `max` in the notebook namespace,
# so from this cell onward they refer to the Spark column functions rather than
# Python's built-in min()/max().
from pyspark.sql.functions import count, min, max

# Show the documentation for the `count` aggregate function.
help(count)

Help on function count in module pyspark.sql.functions:

count(col: 'ColumnOrName') -> pyspark.sql.column.Column
    Aggregate function: returns the number of items in a group.
    
    .. versionadded:: 1.3.0
    
    .. versionchanged:: 3.4.0
        Support Spark Connect.
    
    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        target column to compute on.
    
    Returns
    -------
    :class:`~pyspark.sql.Column`
        column for computed results.
    
    Examples
    --------
    Count by all columns (start), and by a column that does not count ``None``.
    
    >>> df = spark.createDataFrame([(None,), ("a",), ("b",), ("c",)], schema=["alphabets"])
    >>> df.select(count(expr("*")), count(df.alphabets)).show()
    +--------+----------------+
    |count(1)|count(alphabets)|
    +--------+----------------+
    |       4|               3|
    +--------+----------------+



In [0]:
# Compute several aggregates per department in a single agg() call.
# `count`, `min` and `max` here are the pyspark.sql.functions versions imported
# in an earlier cell, not the Python built-ins.
(
    df.groupBy('Dept')
    .agg(
        count('*').alias('count of employees'),
        min('Salary').alias('Min of Salary'),
        max('Salary').alias('Max of Salary'),
    )
    .show()
)

+----+------------------+-------------+-------------+
|Dept|count of employees|Min of Salary|Max of Salary|
+----+------------------+-------------+-------------+
|  IT|                 1|         5000|         5000|
|  HR|                 2|         5000|         5000|
|  CS|                 3|         5000|         5000|
+----+------------------+-------------+-------------+

