In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

Create a sample DataFrame

In [16]:
# Column names, in the same order as the fields of each record below.
schema = ("empname", "dept", "state", "salary", "age")

# Sample employee records: one tuple per employee.
data = (
    ("James",  "Sales",     "NY", 9000, 34),
    ("Alicia", "Sales",     "NY", 8600, 56),
    ("Robert", "Sales",     "CA", 8100, 30),
    ("Lisa",   "Finance",   "CA", 9000, 24),
    ("Deja",   "Finance",   "CA", 9900, 40),
    ("Sugie",  "Finance",   "NY", 8300, 36),
    ("Ram",    "Finance",   "NY", 7900, 53),
    ("Kyle",   "Marketing", "CA", 8000, 25),
    ("Reid",   "Marketing", "NY", 9100, 50),
)

# Build the DataFrame from the in-memory records and preview it.
df = spark.createDataFrame(data, schema=schema)
df.show()

+-------+---------+-----+------+---+
|empname|     dept|state|salary|age|
+-------+---------+-----+------+---+
|  James|    Sales|   NY|  9000| 34|
| Alicia|    Sales|   NY|  8600| 56|
| Robert|    Sales|   CA|  8100| 30|
|   Lisa|  Finance|   CA|  9000| 24|
|   Deja|  Finance|   CA|  9900| 40|
|  Sugie|  Finance|   NY|  8300| 36|
|    Ram|  Finance|   NY|  7900| 53|
|   Kyle|Marketing|   CA|  8000| 25|
|   Reid|Marketing|   NY|  9100| 50|
+-------+---------+-----+------+---+



Aggregating with groupBy(): avg(), sum(), min(), max(), count(), agg()

In [23]:
# Build the per-department grouping once and reuse it for both aggregates.
dept_groups = df.groupBy("dept")

dept_groups.avg("salary").show()  # average salary per department
dept_groups.max("salary").show()  # highest salary per department

+---------+-----------------+
|     dept|      avg(salary)|
+---------+-----------------+
|    Sales|8566.666666666666|
|  Finance|           8775.0|
|Marketing|           8550.0|
+---------+-----------------+

+---------+-----------+
|     dept|max(salary)|
+---------+-----------+
|    Sales|       9000|
|  Finance|       9900|
|Marketing|       9100|
+---------+-----------+



Ex2 - Multiple Columns

In [24]:
# Group on two keys (dept, state) and take the minimum of two columns at once.
(
    df.groupBy("dept", "state")
    .min("salary", "age")
    .show()
)

+---------+-----+-----------+--------+
|     dept|state|min(salary)|min(age)|
+---------+-----+-----------+--------+
|    Sales|   NY|       8600|      34|
|    Sales|   CA|       8100|      30|
|  Finance|   CA|       9000|      24|
|  Finance|   NY|       7900|      36|
|Marketing|   CA|       8000|      25|
|Marketing|   NY|       9100|      50|
+---------+-----+-----------+--------+



Ex3 - Using agg() for multiple aggregations

In [29]:
# Several aggregates per department in one agg() call.
# The functions are qualified with F. so it is explicit that these are the Spark
# column functions and not the Python builtins max/min/round, which the
# star import at the top of the notebook shadows.
(
    df.groupBy("dept")
    .agg(
        F.max("salary").alias("max_salary"),
        F.min("salary").alias("min_salary"),
        F.round(F.avg("salary"), 2).alias("avg_salary"),  # average rounded to 2 decimals
    )
    .show()
)

+---------+----------+----------+----------+
|     dept|max_salary|min_salary|avg_salary|
+---------+----------+----------+----------+
|    Sales|      9000|      8100|   8566.67|
|  Finance|      9900|      7900|    8775.0|
|Marketing|      9100|      8000|    8550.0|
+---------+----------+----------+----------+



Filtering before and after aggregation with filter() / where()

In [33]:
# First filter rows (state is NY), then aggregate per department, then filter
# the aggregated result (minimum salary above 8000).
# F.min is qualified so it is clearly the Spark aggregate, not Python's builtin min.
(
    df.filter("state == 'NY'")
    .groupBy("dept")
    .agg(F.min("salary").alias("min_salary"))
    .filter("min_salary > 8000")
    .show()
)

+---------+----------+
|     dept|min_salary|
+---------+----------+
|    Sales|      8600|
|Marketing|      9100|
+---------+----------+



pivot()

In [42]:
# Pivot: one row per dept, one column per distinct state value,
# each cell holding the summed salary for that (dept, state) pair.
df_pivot = (
    df.groupBy("dept")
    .pivot("state")
    .sum("salary")
)

# Display the DataFrame built above instead of rebuilding the same pipeline.
df_pivot.show()

+---------+-----+-----+
|     dept|   CA|   NY|
+---------+-----+-----+
|    Sales| 8100|17600|
|  Finance|18900|16200|
|Marketing| 8000| 9100|
+---------+-----+-----+



unpivot()

In [55]:
# Turn the CA / NY columns of df_pivot back into (state, salary) rows:
# stack(2, ...) emits two rows per input row, one per label/column pair.
unpivot_expr = "stack(2, 'CA', CA, 'NY', NY) as (state, salary)"
df_pivot.selectExpr("dept", unpivot_expr).show()


+---------+-----+------+
|     dept|state|salary|
+---------+-----+------+
|    Sales|   CA|  8100|
|    Sales|   NY| 17600|
|  Finance|   CA| 18900|
|  Finance|   NY| 16200|
|Marketing|   CA|  8000|
|Marketing|   NY|  9100|
+---------+-----+------+

