### WINDOWING

In [1]:
##Creamos una SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

In [2]:
##Creamos un DF
simpleData = (("James", "Sales", 3000), \
    ("Michael", "Sales", 4600),  \
    ("Robert", "Sales", 4100),   \
    ("Maria", "Finance", 3000),  \
    ("James", "Sales", 3000),    \
    ("Scott", "Finance", 3300),  \
    ("Jen", "Finance", 3900),    \
    ("Jeff", "Marketing", 3000), \
    ("Kumar", "Marketing", 2000),\
    ("Saif", "Sales", 4100) \
  ) #Datos
 
columns= ["employee_name", "department", "salary"]#nombre columnas
df = spark.createDataFrame(data = simpleData, schema = columns)#crear DF
df.printSchema()#mostrar esquema
df.show(truncate=False)#mostrar datos

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



**row_number Window Function**

In [15]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

In [17]:
windowSpec  = Window.partitionBy("department").orderBy("salary") #particionamos por departamento y ordenamos por salario
df.withColumn("row_number",row_number().over(windowSpec)).show(truncate=False) #numeramos las filas por departamento con salario ascendente

+-------------+----------+------+----------+
|employee_name|department|salary|row_number|
+-------------+----------+------+----------+
|James        |Sales     |3000  |1         |
|James        |Sales     |3000  |2         |
|Robert       |Sales     |4100  |3         |
|Saif         |Sales     |4100  |4         |
|Michael      |Sales     |4600  |5         |
|Maria        |Finance   |3000  |1         |
|Scott        |Finance   |3300  |2         |
|Jen          |Finance   |3900  |3         |
|Kumar        |Marketing |2000  |1         |
|Jeff         |Marketing |3000  |2         |
+-------------+----------+------+----------+



**rank Window Function**

Esta funcion asigna un ranking por departamento en base al salario de menor a mayor. Si hay empate asigna el mismo puesto.

In [7]:
from pyspark.sql.functions import rank
df.withColumn("rank",rank().over(windowSpec)).show() 

+-------------+----------+------+----+
|employee_name|department|salary|rank|
+-------------+----------+------+----+
|        James|     Sales|  3000|   1|
|        James|     Sales|  3000|   1|
|       Robert|     Sales|  4100|   3|
|         Saif|     Sales|  4100|   3|
|      Michael|     Sales|  4600|   5|
|        Maria|   Finance|  3000|   1|
|        Scott|   Finance|  3300|   2|
|          Jen|   Finance|  3900|   3|
|        Kumar| Marketing|  2000|   1|
|         Jeff| Marketing|  3000|   2|
+-------------+----------+------+----+



**dense_rank Window Function**

In [8]:
from pyspark.sql.functions import dense_rank
df.withColumn("dense_rank",dense_rank().over(windowSpec)).show() #igual que el anterior pero no se salta puestos cuando hay empates

+-------------+----------+------+----------+
|employee_name|department|salary|dense_rank|
+-------------+----------+------+----------+
|        James|     Sales|  3000|         1|
|        James|     Sales|  3000|         1|
|       Robert|     Sales|  4100|         2|
|         Saif|     Sales|  4100|         2|
|      Michael|     Sales|  4600|         3|
|        Maria|   Finance|  3000|         1|
|        Scott|   Finance|  3300|         2|
|          Jen|   Finance|  3900|         3|
|        Kumar| Marketing|  2000|         1|
|         Jeff| Marketing|  3000|         2|
+-------------+----------+------+----------+



**percent_rank Window Function**

In [9]:
from pyspark.sql.functions import percent_rank
df.withColumn("percent_rank",percent_rank().over(windowSpec)).show() #igual que la anterior pero asignando "porcentajes"

+-------------+----------+------+------------+
|employee_name|department|salary|percent_rank|
+-------------+----------+------+------------+
|        James|     Sales|  3000|         0.0|
|        James|     Sales|  3000|         0.0|
|       Robert|     Sales|  4100|         0.5|
|         Saif|     Sales|  4100|         0.5|
|      Michael|     Sales|  4600|         1.0|
|        Maria|   Finance|  3000|         0.0|
|        Scott|   Finance|  3300|         0.5|
|          Jen|   Finance|  3900|         1.0|
|        Kumar| Marketing|  2000|         0.0|
|         Jeff| Marketing|  3000|         1.0|
+-------------+----------+------+------------+



**ntile Window Function**

In [11]:
from pyspark.sql.functions import ntile
df.withColumn("ntile",ntile(2).over(windowSpec)).show() #te devuelve un rango relativo entre 1 y el valor de la funcion ntile

+-------------+----------+------+-----+
|employee_name|department|salary|ntile|
+-------------+----------+------+-----+
|        James|     Sales|  3000|    1|
|        James|     Sales|  3000|    1|
|       Robert|     Sales|  4100|    1|
|         Saif|     Sales|  4100|    2|
|      Michael|     Sales|  4600|    2|
|        Maria|   Finance|  3000|    1|
|        Scott|   Finance|  3300|    1|
|          Jen|   Finance|  3900|    2|
|        Kumar| Marketing|  2000|    1|
|         Jeff| Marketing|  3000|    2|
+-------------+----------+------+-----+



**cume_dist Window Function**

In [12]:
from pyspark.sql.functions import cume_dist    
df.withColumn("cume_dist",cume_dist().over(windowSpec)).show() #distribucion acumulada de los valores:
#en sales hay 5 valores, por lo que a cada uno le corresponde 0.2, como hay 4 valores que se repiten 2 a 2 se suma
#en finance hay tres valores, por lo que le corresponde 0.3 periodo a cada uno
#en marketing hay dos valores, por lo que corresponde 0.5 a cada uno

+-------------+----------+------+------------------+
|employee_name|department|salary|         cume_dist|
+-------------+----------+------+------------------+
|        James|     Sales|  3000|               0.4|
|        James|     Sales|  3000|               0.4|
|       Robert|     Sales|  4100|               0.8|
|         Saif|     Sales|  4100|               0.8|
|      Michael|     Sales|  4600|               1.0|
|        Maria|   Finance|  3000|0.3333333333333333|
|        Scott|   Finance|  3300|0.6666666666666666|
|          Jen|   Finance|  3900|               1.0|
|        Kumar| Marketing|  2000|               0.5|
|         Jeff| Marketing|  3000|               1.0|
+-------------+----------+------+------------------+



**lag Window Function**

In [14]:
from pyspark.sql.functions import lag    
df.withColumn("lag",lag("salary",2).over(windowSpec)).show() #devuelve valores anteriores con el lag establecido, en este caso 
#devuelve el valor de dos filas anteriores

+-------------+----------+------+----+
|employee_name|department|salary| lag|
+-------------+----------+------+----+
|        James|     Sales|  3000|null|
|        James|     Sales|  3000|null|
|       Robert|     Sales|  4100|3000|
|         Saif|     Sales|  4100|3000|
|      Michael|     Sales|  4600|4100|
|        Maria|   Finance|  3000|null|
|        Scott|   Finance|  3300|null|
|          Jen|   Finance|  3900|3000|
|        Kumar| Marketing|  2000|null|
|         Jeff| Marketing|  3000|null|
+-------------+----------+------+----+



**lead Window Function**

In [16]:
from pyspark.sql.functions import lead    
df.withColumn("lead",lead("salary",2).over(windowSpec)).show() #devuelve los valores posteriores con el lead establecido, en este
#caso devuelve el valor de las dos filas posteriores

+-------------+----------+------+----+
|employee_name|department|salary|lead|
+-------------+----------+------+----+
|        James|     Sales|  3000|4100|
|        James|     Sales|  3000|4100|
|       Robert|     Sales|  4100|4600|
|         Saif|     Sales|  4100|null|
|      Michael|     Sales|  4600|null|
|        Maria|   Finance|  3000|3900|
|        Scott|   Finance|  3300|null|
|          Jen|   Finance|  3900|null|
|        Kumar| Marketing|  2000|null|
|         Jeff| Marketing|  3000|null|
+-------------+----------+------+----+



**Window Aggregate Functions**

In [18]:
windowSpecAgg  = Window.partitionBy("department") #particionamos por departamento

from pyspark.sql.functions import col,avg,sum,min,max,row_number #cargamos las funciones

df.withColumn("row",row_number().over(windowSpec)) \
  .withColumn("avg", avg(col("salary")).over(windowSpecAgg)) \
  .withColumn("sum", sum(col("salary")).over(windowSpecAgg)) \
  .withColumn("min", min(col("salary")).over(windowSpecAgg)) \
  .withColumn("max", max(col("salary")).over(windowSpecAgg)) \
  .where(col("row")==1).select("*") \
  .show()
#en este caso lo que hace es coger la columna salario y la agrega por departamento, devolviendo asi las distintas metricas pedidas

+-------------+----------+------+---+------+-----+----+----+
|employee_name|department|salary|row|   avg|  sum| min| max|
+-------------+----------+------+---+------+-----+----+----+
|        James|     Sales|  3000|  1|3760.0|18800|3000|4600|
|        Maria|   Finance|  3000|  1|3400.0|10200|3000|3900|
|        Kumar| Marketing|  2000|  1|2500.0| 5000|2000|3000|
+-------------+----------+------+---+------+-----+----+----+



In [19]:
df.withColumn("row",row_number().over(windowSpec)) \
  .withColumn("avg", avg(col("salary")).over(windowSpecAgg)) \
  .withColumn("sum", sum(col("salary")).over(windowSpecAgg)) \
  .withColumn("min", min(col("salary")).over(windowSpecAgg)) \
  .withColumn("max", max(col("salary")).over(windowSpecAgg)) \
  .select("*") \
  .show()

+-------------+----------+------+---+------+-----+----+----+
|employee_name|department|salary|row|   avg|  sum| min| max|
+-------------+----------+------+---+------+-----+----+----+
|        James|     Sales|  3000|  1|3760.0|18800|3000|4600|
|        James|     Sales|  3000|  2|3760.0|18800|3000|4600|
|       Robert|     Sales|  4100|  3|3760.0|18800|3000|4600|
|         Saif|     Sales|  4100|  4|3760.0|18800|3000|4600|
|      Michael|     Sales|  4600|  5|3760.0|18800|3000|4600|
|        Maria|   Finance|  3000|  1|3400.0|10200|3000|3900|
|        Scott|   Finance|  3300|  2|3400.0|10200|3000|3900|
|          Jen|   Finance|  3900|  3|3400.0|10200|3000|3900|
|        Kumar| Marketing|  2000|  1|2500.0| 5000|2000|3000|
|         Jeff| Marketing|  3000|  2|2500.0| 5000|2000|3000|
+-------------+----------+------+---+------+-----+----+----+



**Hacer con JOIN**

In [8]:
import pyspark.sql.functions as F
dfjoin = df.groupBy("department")\
           .agg(F.avg("salary").alias("mean"),
                F.sum("salary").alias("sum"),
                F.min("salary").alias("min"),
                F.max("salary").alias("max"))

dfjoin.select("*").show()

+----------+------+-----+----+----+
|department|  mean|  sum| min| max|
+----------+------+-----+----+----+
|     Sales|3760.0|18800|3000|4600|
|   Finance|3400.0|10200|3000|3900|
| Marketing|2500.0| 5000|2000|3000|
+----------+------+-----+----+----+



In [13]:
df1 = df.alias("df1")
df2 = dfjoin.alias("df2")
df_final = df1.join(df2, (df1.department == df2.department), how="left"). select(df1["*"], df2["mean"], df2["sum"],df2["max"], df2["min"])
df_final.select("*").show()

+-------------+----------+------+------+-----+----+----+
|employee_name|department|salary|  mean|  sum| max| min|
+-------------+----------+------+------+-----+----+----+
|        James|     Sales|  3000|3760.0|18800|4600|3000|
|      Michael|     Sales|  4600|3760.0|18800|4600|3000|
|       Robert|     Sales|  4100|3760.0|18800|4600|3000|
|        James|     Sales|  3000|3760.0|18800|4600|3000|
|         Saif|     Sales|  4100|3760.0|18800|4600|3000|
|        Maria|   Finance|  3000|3400.0|10200|3900|3000|
|        Scott|   Finance|  3300|3400.0|10200|3900|3000|
|          Jen|   Finance|  3900|3400.0|10200|3900|3000|
|         Jeff| Marketing|  3000|2500.0| 5000|3000|2000|
|        Kumar| Marketing|  2000|2500.0| 5000|3000|2000|
+-------------+----------+------+------+-----+----+----+



### PIVOTING

In [20]:
data = [("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"), \
      ("Orange",2000,"USA"),("Orange",2000,"USA"),("Banana",400,"China"), \
      ("Carrots",1200,"China"),("Beans",1500,"China"),("Orange",4000,"China"), \
      ("Banana",2000,"Canada"),("Carrots",2000,"Canada"),("Beans",2000,"Mexico")] #Creamos los datos

columns= ["Product","Amount","Country"] #nombre a las columnas
df = spark.createDataFrame(data = data, schema = columns) #creamos DF
df.printSchema() #mostramos esquema
df.show(truncate=False)#mostramos DF

root
 |-- Product: string (nullable = true)
 |-- Amount: long (nullable = true)
 |-- Country: string (nullable = true)

+-------+------+-------+
|Product|Amount|Country|
+-------+------+-------+
|Banana |1000  |USA    |
|Carrots|1500  |USA    |
|Beans  |1600  |USA    |
|Orange |2000  |USA    |
|Orange |2000  |USA    |
|Banana |400   |China  |
|Carrots|1200  |China  |
|Beans  |1500  |China  |
|Orange |4000  |China  |
|Banana |2000  |Canada |
|Carrots|2000  |Canada |
|Beans  |2000  |Mexico |
+-------+------+-------+



In [22]:
#Utilizamos los valores de la columna country para formar nuevas columnas y agrupamos los datos por la suma de la cantidad
pivotDF=df.groupBy("Product").pivot("Country").sum("Amount")
pivotDF.printSchema()
pivotDF.show(truncate=False)

root
 |-- Product: string (nullable = true)
 |-- Canada: long (nullable = true)
 |-- China: long (nullable = true)
 |-- Mexico: long (nullable = true)
 |-- USA: long (nullable = true)

+-------+------+-----+------+----+
|Product|Canada|China|Mexico|USA |
+-------+------+-----+------+----+
|Orange |null  |4000 |null  |4000|
|Beans  |null  |1500 |2000  |1600|
|Banana |2000  |400  |null  |1000|
|Carrots|2000  |1200 |null  |1500|
+-------+------+-----+------+----+



In [24]:
#Otra forma de hacerlo
countries = ["USA","China","Canada","Mexico"]
pivotDF = df.groupBy("Product").pivot("Country", countries).sum("Amount")
pivotDF.show(truncate=False)

+-------+----+-----+------+------+
|Product|USA |China|Canada|Mexico|
+-------+----+-----+------+------+
|Orange |4000|4000 |null  |null  |
|Beans  |1600|1500 |null  |2000  |
|Banana |1000|400  |2000  |null  |
|Carrots|1500|1200 |2000  |null  |
+-------+----+-----+------+------+



In [26]:
#Otra forma a traves de dos agrupaciones
pivotDF = df.groupBy("Product","Country") \
      .sum("Amount") \
      .groupBy("Product") \
      .pivot("Country") \
      .sum("sum(Amount)")

pivotDF.show(truncate=False)

+-------+------+-----+------+----+
|Product|Canada|China|Mexico|USA |
+-------+------+-----+------+----+
|Orange |null  |4000 |null  |4000|
|Beans  |null  |1500 |2000  |1600|
|Banana |2000  |400  |null  |1000|
|Carrots|2000  |1200 |null  |1500|
+-------+------+-----+------+----+



### UNPIVOT

In [28]:
from pyspark.sql.functions import expr

unpivotExpr = "stack(3, 'Canada', Canada, 'China', China, 'Mexico', Mexico) as (Country,Total)"

unPivotDF = pivotDF.select("Product", expr(unpivotExpr)) \
    .where("Total is not null")

unPivotDF.show(truncate=False)

+-------+-------+-----+
|Product|Country|Total|
+-------+-------+-----+
|Orange |China  |4000 |
|Beans  |China  |1500 |
|Beans  |Mexico |2000 |
|Banana |Canada |2000 |
|Banana |China  |400  |
|Carrots|Canada |2000 |
|Carrots|China  |1200 |
+-------+-------+-----+

