<a href="https://colab.research.google.com/github/margaridagomes/dataeng-basic-course/blob/main/spark/examples/05-aggregations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark/examples/05-aggregations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Aggregations
- Group By
- Windows Functions

# Setting up PySpark

In [1]:
%pip install pyspark



In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local').appName('Spark Course').config('spark.ui.port', '4050').getOrCreate()

# Aggregations

https://spark.apache.org/docs/latest/sql-ref-functions-builtin.html#aggregate-functions

https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-aggregate.html

In [23]:
sql_query = """CREATE OR REPLACE TEMPORARY VIEW basic_pays AS SELECT * FROM VALUES
('Diane Murphy','Accounting',8435),
('Mary Patterson','Accounting',9998),
('Jeff Firrelli','Accounting',8992),
('William Patterson','Accounting',8870),
('Gerard Bondur','Accounting',11472),
('Anthony Bow','Accounting',6627),
('Leslie Jennings','IT',8113),
('Leslie Thompson','IT',5186),
('Julie Firrelli','Sales',9181),
('Steve Patterson','Sales',9441),
('Foon Yue Tseng','Sales',6660),
('George Vanauf','Sales',10563),
('Loui Bondur','SCM',10449),
('Gerard Hernandez','SCM',6949),
('Pamela Castillo','SCM',11303),
('Larry Bott','SCM',11798),
('Barry Jones','SCM',10586)
AS basic_pays(employee_name, department, salary)"""

# creating temp view
spark.sql(sql_query)

df = spark.table("basic_pays")
df.show()


+-----------------+----------+------+
|    employee_name|department|salary|
+-----------------+----------+------+
|     Diane Murphy|Accounting|  8435|
|   Mary Patterson|Accounting|  9998|
|    Jeff Firrelli|Accounting|  8992|
|William Patterson|Accounting|  8870|
|    Gerard Bondur|Accounting| 11472|
|      Anthony Bow|Accounting|  6627|
|  Leslie Jennings|        IT|  8113|
|  Leslie Thompson|        IT|  5186|
|   Julie Firrelli|     Sales|  9181|
|  Steve Patterson|     Sales|  9441|
|   Foon Yue Tseng|     Sales|  6660|
|    George Vanauf|     Sales| 10563|
|      Loui Bondur|       SCM| 10449|
| Gerard Hernandez|       SCM|  6949|
|  Pamela Castillo|       SCM| 11303|
|       Larry Bott|       SCM| 11798|
|      Barry Jones|       SCM| 10586|
+-----------------+----------+------+



In [4]:
df

DataFrame[employee_name: string, department: string, salary: int]

In [5]:
df.write.mode("overwrite").saveAsTable("basic_pays")

In [24]:
perc_query = """SELECT
    department,
    percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) AS pc1,
    percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) FILTER (WHERE employee_name LIKE '%Bo%') AS pc2,
    percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) AS pc3,
    percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) FILTER (WHERE employee_name LIKE '%Bo%') AS pc4,
    percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) AS pd1,
    percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) FILTER (WHERE employee_name LIKE '%Bo%') AS pd2,
    percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) AS pd3,
    percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) FILTER (WHERE employee_name LIKE '%Bo%') AS pd4
FROM basic_pays
GROUP BY department
ORDER BY department;"""

spark.sql(perc_query).show()

+----------+-------+--------+-------+--------+-------+-------+-------+-------+
|department|    pc1|     pc2|    pc3|     pc4|    pd1|    pd2|    pd3|    pd4|
+----------+-------+--------+-------+--------+-------+-------+-------+-------+
|Accounting|8543.75| 7838.25| 9746.5|10260.75| 8435.0| 6627.0| 9998.0|11472.0|
|        IT|5917.75|    NULL|7381.25|    NULL| 5186.0|   NULL| 8113.0|   NULL|
|       SCM|10449.0|10786.25|11303.0|11460.75|10449.0|10449.0|11303.0|11798.0|
|     Sales|8550.75|    NULL| 9721.5|    NULL| 6660.0|   NULL|10563.0|   NULL|
+----------+-------+--------+-------+--------+-------+-------+-------+-------+



In [25]:
from pyspark.sql.functions import *
d2 = (df
 .groupBy("department")
 .agg(sum("salary").alias("sum_salary"),
      round(avg("salary"),2).alias("avg_salary"),
      min("salary").alias("min_salary"),
      array_agg("employee_name").alias("employees"),
      count(lit("")).alias("count_employees"))
 .filter(col("count_employees") > 2)
 #.show(10, False)
 )

#transformation

In [9]:
d2.count()

3

In [26]:
d2.show()

+----------+----------+----------+----------+--------------------+---------------+
|department|sum_salary|avg_salary|min_salary|           employees|count_employees|
+----------+----------+----------+----------+--------------------+---------------+
|     Sales|     35845|   8961.25|      6660|[Julie Firrelli, ...|              4|
|Accounting|     54394|   9065.67|      6627|[Diane Murphy, Ma...|              6|
|       SCM|     51085|   10217.0|      6949|[Loui Bondur, Ger...|              5|
+----------+----------+----------+----------+--------------------+---------------+



# Question

In [None]:
# Q1
# Aggregate data by surname
# Calculate highest salary by surname
# Include the respective employee that has the highest salary
# Include department information about this employee
# Count how many employees has that surname
# Put in an array all the first_names of the respective surname ordered


# schema expected:
# surname | count_employees | highest_salary | employee_with_highest_salary | department_with_highest_salary | array_with_all_the_first_names |

In [27]:
df.show()

+-----------------+----------+------+
|    employee_name|department|salary|
+-----------------+----------+------+
|     Diane Murphy|Accounting|  8435|
|   Mary Patterson|Accounting|  9998|
|    Jeff Firrelli|Accounting|  8992|
|William Patterson|Accounting|  8870|
|    Gerard Bondur|Accounting| 11472|
|      Anthony Bow|Accounting|  6627|
|  Leslie Jennings|        IT|  8113|
|  Leslie Thompson|        IT|  5186|
|   Julie Firrelli|     Sales|  9181|
|  Steve Patterson|     Sales|  9441|
|   Foon Yue Tseng|     Sales|  6660|
|    George Vanauf|     Sales| 10563|
|      Loui Bondur|       SCM| 10449|
| Gerard Hernandez|       SCM|  6949|
|  Pamela Castillo|       SCM| 11303|
|       Larry Bott|       SCM| 11798|
|      Barry Jones|       SCM| 10586|
+-----------------+----------+------+



In [16]:
from pyspark.sql import functions as F

df.withColumn("new_column",
              F.concat(F.col("employee_name"),F.lit("__"))).show()

+-----------------+----------+------+-------------------+
|    employee_name|department|salary|         new_column|
+-----------------+----------+------+-------------------+
|     Diane Murphy|Accounting|  8435|     Diane Murphy__|
|   Mary Patterson|Accounting|  9998|   Mary Patterson__|
|    Jeff Firrelli|Accounting|  8992|    Jeff Firrelli__|
|William Patterson|Accounting|  8870|William Patterson__|
|    Gerard Bondur|Accounting| 11472|    Gerard Bondur__|
|      Anthony Bow|Accounting|  6627|      Anthony Bow__|
|  Leslie Jennings|        IT|  8113|  Leslie Jennings__|
|  Leslie Thompson|        IT|  5186|  Leslie Thompson__|
|   Julie Firrelli|     Sales|  9181|   Julie Firrelli__|
|  Steve Patterson|     Sales|  9441|  Steve Patterson__|
|   Foon Yue Tseng|     Sales|  6660|   Foon Yue Tseng__|
|    George Vanauf|     Sales| 10563|    George Vanauf__|
|      Loui Bondur|       SCM| 10449|      Loui Bondur__|
| Gerard Hernandez|       SCM|  6949| Gerard Hernandez__|
|  Pamela Cast

In [41]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Split employee_name em palavras (array)
df_names = df.withColumn("first_name", F.split(F.col("employee_name"), " ")[0]).withColumn("surname", F.split(F.col("employee_name"), " ")[1])

df_names.show()

# Janela para particionar por surname e ordenar por salário descendente
window_surname = Window.partitionBy("surname").orderBy(F.desc("salary"))

# Adicionar número da linha para cada grupo (o maior salário vai ser o número 1)
df_surname = df_names.withColumn("row_number", F.row_number().over(window_surname))

# Filtrar só os que têm row_number = 1 (ou seja, o maior salário de cada sobrenome)
df_highest_salary = df_surname.filter(F.col("row_number") == 1)

df_highest_salary.select(
    "surname",
    "employee_name",
    "salary",
    "department"
).show()

# Contagem de funcionários e array ordenado dos primeiros nomes
df_agg = df_names.groupBy("surname").agg(
    F.count("*").alias("count_employees"),
    F.array_sort(F.collect_list("first_name")).alias("array_with_all_the_first_names")
)

# Junta tudo (resultado final!)
df_final = df_highest_salary.join(df_agg, on="surname", how="inner")

# schema expected:
# surname | count_employees | highest_salary | employee_with_highest_salary | department_with_highest_salary | array_with_all_the_first_names |
df_final.select(
    "surname",
    "count_employees",
    "salary",
    "employee_name",
    "department",
    "array_with_all_the_first_names"
).show()


+-----------------+----------+------+----------+---------+
|    employee_name|department|salary|first_name|  surname|
+-----------------+----------+------+----------+---------+
|     Diane Murphy|Accounting|  8435|     Diane|   Murphy|
|   Mary Patterson|Accounting|  9998|      Mary|Patterson|
|    Jeff Firrelli|Accounting|  8992|      Jeff| Firrelli|
|William Patterson|Accounting|  8870|   William|Patterson|
|    Gerard Bondur|Accounting| 11472|    Gerard|   Bondur|
|      Anthony Bow|Accounting|  6627|   Anthony|      Bow|
|  Leslie Jennings|        IT|  8113|    Leslie| Jennings|
|  Leslie Thompson|        IT|  5186|    Leslie| Thompson|
|   Julie Firrelli|     Sales|  9181|     Julie| Firrelli|
|  Steve Patterson|     Sales|  9441|     Steve|Patterson|
|   Foon Yue Tseng|     Sales|  6660|      Foon|      Yue|
|    George Vanauf|     Sales| 10563|    George|   Vanauf|
|      Loui Bondur|       SCM| 10449|      Loui|   Bondur|
| Gerard Hernandez|       SCM|  6949|    Gerard|Hernande