### Aggregation (SQL)

1. Min
2. Max
3. Avg
4. Sum


In [1]:
import findspark
# Initialise findspark before any pyspark import in the cells below.
# NOTE(review): per findspark's documentation this locates the Spark install
# and puts pyspark on sys.path — confirm SPARK_HOME is set on this machine.
findspark.init()
# Last expression of the cell: its return value (the Spark home path that
# findspark resolved) is rendered as the cell output.
findspark.find()

'C:\\Program Files\\spark\\spark-3.4.1-bin-hadoop3'

In [2]:
from pyspark.sql import SparkSession, functions as fs

# Create the session for this notebook, or reuse one that is already running:
# app name 'Aggregation', local master with 2 worker threads.
spark = (
    SparkSession.builder
    .appName('Aggregation')
    .master('local[2]')
    .getOrCreate()
)

In [3]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType

# Explicit schema for the customer CSV: (column name, Spark type) pairs listed
# in file order. Every column is declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("cid", StringType()),
        ("fname", StringType()),
        ("lname", StringType()),
        ("age", IntegerType()),
        ("desig", StringType()),
    ]
])

In [4]:
# Read the customer CSV with the explicit schema defined above.
# The parse mode is 'permissive' and the first line of the file is treated as
# the header row.
df = (
    spark.read
    .option('mode', 'permissive')
    .option('header', True)
    .schema(schema)
    .csv('../data/custs_with_header.csv')
)

# Preview only the first 10 rows (show() prints 20 rows when no count is given).
df.show(n=10)

+-------+--------+----------+---+--------------------+
|    cid|   fname|     lname|age|               desig|
+-------+--------+----------+---+--------------------+
|4000001|Kristina|     Chung| 55|               Pilot|
|4000002|   Paige|      Chen| 74|             Teacher|
|4000003|  Sherri|    Melton| 34|         Firefighter|
|4000004|Gretchen|      Hill| 66|Computer hardware...|
|4000005|   Karen|   Puckett| 74|              Lawyer|
|4000006| Patrick|      Song| 42|        Veterinarian|
|4000007|   Elsie|  Hamilton| 43|               Pilot|
|4000008|   Hazel|    Bender| 63|           Carpenter|
|4000009| Malcolm|    Wagner| 39|              Artist|
|4000010| Dolores|McLaughlin| 60|              Writer|
+-------+--------+----------+---+--------------------+
only showing top 10 rows



In [5]:
# Whole-table aggregates over the 'age' column: average, total, minimum, maximum.
# With no grouping the result is a single row.
df.select(
    fs.avg('age'),
    fs.sum('age'),
    fs.min('age'),
    fs.max('age'),
).show()

+-----------------+--------+--------+--------+
|         avg(age)|sum(age)|min(age)|max(age)|
+-----------------+--------+--------+--------+
|48.56805680568057|  485632|      21|      75|
+-----------------+--------+--------+--------+



### Multiple Aggregation


In [17]:
# Number of customers per designation, largest groups first.
# show() prints only the first 20 rows by default.
(
    df.groupBy('desig')
    .count()
    .orderBy("count", ascending=False)
    .show()
)

+--------------------+-----+
|               desig|count|
+--------------------+-----+
|          Politician|  227|
|        Photographer|  222|
|Computer support ...|  222|
|        Loan officer|  221|
|           Librarian|  218|
|         Firefighter|  217|
|Computer software...|  216|
|          Pharmacist|  213|
|Human resources a...|  212|
|       Social worker|  212|
|Recreation and fi...|  210|
|      Police officer|  209|
|               Pilot|  209|
|        Veterinarian|  208|
|    Childcare worker|  207|
|             Chemist|  206|
|            Musician|  204|
|            Designer|  204|
|Computer hardware...|  204|
|Engineering techn...|  204|
+--------------------+-----+
only showing top 20 rows



In [22]:
# Mean age rounded to a whole number; the alias 'average_age' is also the
# column name used for sorting below.
average_age = fs.round(fs.avg('age')).alias('average_age')

# Several aggregates per designation in one pass: total, rounded mean,
# minimum and maximum age, ordered by the rounded mean (ascending).
(
    df.groupBy('desig')
    .agg(
        fs.sum('age').alias('Total Age'),
        average_age,
        fs.min('age'),
        fs.max('age'),
    )
    .orderBy('average_age')
    .show()
)

+--------------------+---------+-----------+--------+--------+
|               desig|Total Age|average_age|min(age)|max(age)|
+--------------------+---------+-----------+--------+--------+
|               Coach|     9048|       45.0|      21|      75|
|          Pharmacist|     9850|       46.0|      21|      75|
|              Lawyer|     9532|       47.0|      21|      75|
|                null|     3919|       47.0|      21|      75|
|           Secretary|     9398|       47.0|      21|      75|
|              Artist|     8170|       47.0|      21|      75|
|      Civil engineer|     9136|       47.0|      21|      75|
|        Psychologist|     9199|       47.0|      21|      75|
|           Librarian|    10450|       48.0|      21|      75|
|    Childcare worker|     9983|       48.0|      22|      75|
|        Photographer|    10606|       48.0|      21|      75|
|           Carpenter|     8692|       48.0|      21|      75|
|               Pilot|     9989|       48.0|      21|  