### ![Spark Logo Tiny](https://files.training.databricks.com/images/105/logo_spark_tiny.png) Usando groupBy

In [0]:
# Sample employee records; each inner list is [ID, NOMBRE, AREA, SALARIO].
lista = [
    ['1', 'alfonso', 'IT', 45000],
    ['2', 'alfredo', 'CS', 85000],
    ['3', 'maria', 'CS', 32000],
    ['4', 'javiera', 'ECE', 98000],
    ['5', 'tomas', 'IT', 66000],
    ['6', 'jose', 'AKA', 47000],
    ['7', 'cristian', 'PET', 12000],
    ['8', 'nelson', 'AKA', 33000],
]

# Column names, listed in the same order as the fields of each record.
columnas = ['ID', 'NOMBRE', 'AREA', 'SALARIO']

# Only names are supplied, so Spark infers the column types from the values
# (the quoted IDs come out as strings, the salaries as longs).
df = spark.createDataFrame(data=lista, schema=columnas)

# Print the inferred schema, then the rows with truncate=False.
df.printSchema()
df.show(truncate=False)

root
 |-- ID: string (nullable = true)
 |-- NOMBRE: string (nullable = true)
 |-- AREA: string (nullable = true)
 |-- SALARIO: long (nullable = true)

+---+--------+----+-------+
|ID |NOMBRE  |AREA|SALARIO|
+---+--------+----+-------+
|1  |alfonso |IT  |45000  |
|2  |alfredo |CS  |85000  |
|3  |maria   |CS  |32000  |
|4  |javiera |ECE |98000  |
|5  |tomas   |IT  |66000  |
|6  |jose    |AKA |47000  |
|7  |cristian|PET |12000  |
|8  |nelson  |AKA |33000  |
+---+--------+----+-------+



#### groupby/sum

In [0]:
# Total SALARIO within each AREA group.
(
    df
    .groupBy('AREA')
    .sum('SALARIO')
    .show()
)

+----+------------+
|AREA|sum(SALARIO)|
+----+------------+
|  IT|      111000|
|  CS|      117000|
| ECE|       98000|
| AKA|       80000|
| PET|       12000|
+----+------------+



#### groupby/min

In [0]:
# Lowest SALARIO within each AREA group.
(
    df
    .groupBy('AREA')
    .min('SALARIO')
    .show()
)

+----+------------+
|AREA|min(SALARIO)|
+----+------------+
|  IT|       45000|
|  CS|       32000|
| ECE|       98000|
| AKA|       33000|
| PET|       12000|
+----+------------+



In [0]:
# groupBy() with no keys treats the whole DataFrame as a single group, and
# min() is called without column names; the output shows it only reports
# SALARIO, the one non-string column.
(
    df
    .groupBy()
    .min()
    .show()
)

+------------+
|min(SALARIO)|
+------------+
|       12000|
+------------+



#### groupby/max

In [0]:
# Highest SALARIO within each AREA group.
(
    df
    .groupBy('AREA')
    .max('SALARIO')
    .show()
)

+----+------------+
|AREA|max(SALARIO)|
+----+------------+
|  IT|       66000|
|  CS|       85000|
| ECE|       98000|
| AKA|       47000|
| PET|       12000|
+----+------------+



#### groupby/avg

In [0]:
# Average SALARIO within each AREA group (the result column is a double).
(
    df
    .groupBy('AREA')
    .avg('SALARIO')
    .show()
)

+----+------------+
|AREA|avg(SALARIO)|
+----+------------+
|  IT|     55500.0|
|  CS|     58500.0|
| ECE|     98000.0|
| AKA|     40000.0|
| PET|     12000.0|
+----+------------+



#### groupby/count

In [0]:
# Number of rows in each AREA group; the result column is named 'count'.
(
    df
    .groupBy('AREA')
    .count()
    .show()
)

+----+-----+
|AREA|count|
+----+-----+
|  IT|    2|
|  CS|    2|
| ECE|    1|
| AKA|    2|
| PET|    1|
+----+-----+



##### Ejercicio 1

Cómo seleccionar una columna ordenando el resultado de acuerdo a una columna 'count'

In [0]:
from pyspark.sql.functions import explode

df = spark.createDataFrame([(1,['blue','winter','cozy']),
                            (2,['red','summer','fresh','cooling']),
                            (3,['green','summer','travel'])],
                           ['item_id','atributos'])

df.printSchema()
df.show(truncate=False)

root
 |-- item_id: long (nullable = true)
 |-- atributos: array (nullable = true)
 |    |-- element: string (containsNull = true)

+-------+-----------------------------+
|item_id|atributos                    |
+-------+-----------------------------+
|1      |[blue, winter, cozy]         |
|2      |[red, summer, fresh, cooling]|
|3      |[green, summer, travel]      |
+-------+-----------------------------+



In [0]:
df_explode = df.select(explode('atributos')). \
                groupBy('col'). \
                count(). \
                sort('count', ascending=False). \
                select('col')

df_explode.show()

+-------+
|    col|
+-------+
| summer|
| winter|
|   cozy|
|   blue|
|  fresh|
|    red|
|cooling|
| travel|
|  green|
+-------+



#### groupby/mean

In [0]:
# mean() per AREA; as the output header shows, the result column is
# labelled avg(SALARIO).
(
    df
    .groupBy('AREA')
    .mean('SALARIO')
    .show()
)

+----+------------+
|AREA|avg(SALARIO)|
+----+------------+
|  IT|     55500.0|
|  CS|     58500.0|
| ECE|     98000.0|
| AKA|     40000.0|
| PET|     12000.0|
+----+------------+



#### groupby con múltiples columnas

In [0]:
# Group on the (NOMBRE, AREA) pair. Every pair is unique in this sample, so
# each group holds a single row and the average equals that row's SALARIO.
(
    df
    .groupBy('NOMBRE', 'AREA')
    .avg('SALARIO')
    .show()
)

+--------+----+------------+
|  NOMBRE|AREA|avg(SALARIO)|
+--------+----+------------+
| alfonso|  IT|     45000.0|
| alfredo|  CS|     85000.0|
|   maria|  CS|     32000.0|
| javiera| ECE|     98000.0|
|   tomas|  IT|     66000.0|
|    jose| AKA|     47000.0|
|cristian| PET|     12000.0|
|  nelson| AKA|     33000.0|
+--------+----+------------+



#### groupby/agg

In [0]:
# Import the functions module under an alias instead of importing sum/max/min
# by name: the by-name import rebinds Python's built-in sum(), max() and min()
# to the Spark column functions for every cell that runs afterwards.
from pyspark.sql import functions as F

# Several SALARIO aggregates per AREA, requested in one agg() call.
df.groupBy('AREA').agg(
    F.max('SALARIO'),
    F.sum('SALARIO'),
    F.min('SALARIO'),
    F.mean('SALARIO'),
    F.count('SALARIO'),
).show()

+----+------------+------------+------------+------------+--------------+
|AREA|max(SALARIO)|sum(SALARIO)|min(SALARIO)|avg(SALARIO)|count(SALARIO)|
+----+------------+------------+------------+------------+--------------+
|  IT|       66000|      111000|       45000|     55500.0|             2|
|  CS|       85000|      117000|       32000|     58500.0|             2|
| ECE|       98000|       98000|       98000|     98000.0|             1|
| AKA|       47000|       80000|       33000|     40000.0|             2|
| PET|       12000|       12000|       12000|     12000.0|             1|
+----+------------+------------+------------+------------+--------------+

