# PySpark # 2

##### Índice:

    # Importação bibliotecas / funções
    # Criar Sessão PySpark
    # Criar DF / ler arquivo
    # Drop de Colunas
    
######    Window Ranking Functions
    # Window Function 1 - Numero de linhas - row_number()
    # Window Function 2 - Ranking 1 - rank()
    # Window Function 3 - Ranking 2 - dense_rank()
    # Window Function 4 - Porcentagem Ranking - percent_rank()
    # Window Function 5 - Divisão em ' N ' partes - ntile()
    
######    Window Analytic Functions (Funções analíticas)
    # Window Function 6 - LAG / Degrau - lag()
    # Window Function 7 - Lead / Degrau - lead()
    # Agregações
    # GroupBy + AGG 1
    # GroupBy + AGG 2
    # Where
    # Describe
    # Window Function 8 - Função de agregação usando Window Function

##### Importação bibliotecas / funções

In [1]:
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Point PySpark (both PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON) at the
# interpreter that is running this kernel.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [16]:
from pyspark.sql.window import Window # Importando window function

##### Criar / Iniciar Sessão PySpark

In [None]:
# Start (or reuse, if one already exists) a Spark session with master 'local'
# and application name 'PySpark_02'.
spark = SparkSession.builder.master('local').appName('PySpark_02').getOrCreate()

##### Criar DF / ler arquivo

In [None]:
# Load the World Cup 2018 players CSV: the first row is the header and the
# column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('Arquivos/wc2018-players.csv')
)

##### Exibir DF

In [19]:
# Preview the first 5 rows of the raw file.
df.show(n=5)

+---------+---+----+------------------+----------+----------+--------------------+------+------+
|     Team|  #|Pos.| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|
+---------+---+----+------------------+----------+----------+--------------------+------+------+
|Argentina|  3|  DF|TAGLIAFICO Nicolas|31.08.1992|TAGLIAFICO|      AFC Ajax (NED)|   169|    65|
|Argentina| 22|  MF|    PAVON Cristian|21.01.1996|     PAVÓN|CA Boca Juniors (...|   169|    65|
|Argentina| 15|  MF|    LANZINI Manuel|15.02.1993|   LANZINI|West Ham United F...|   167|    66|
|Argentina| 18|  DF|    SALVIO Eduardo|13.07.1990|    SALVIO|    SL Benfica (POR)|   167|    69|
|Argentina| 10|  FW|      MESSI Lionel|24.06.1987|     MESSI|  FC Barcelona (ESP)|   170|    72|
+---------+---+----+------------------+----------+----------+--------------------+------+------+
only showing top 5 rows



##### Alterações Aula PySpark 01

In [None]:
# Translate the original FIFA column headers into the Portuguese names used
# by the rest of the notebook (old name -> new name, applied in this order).
novos_nomes = {
    'Team': 'Selecao',
    '#': 'Numero',
    'Pos.': 'Posicao',
    'FIFA Popular Name': 'Nome_FIFA',
    'Birth Date': 'Nascimento',
    'Shirt Name': 'Nome Camiseta',
    'Club': 'Time',
    'Height': 'Altura',
    'Weight': 'Peso',
}
for nome_antigo, nome_novo in novos_nomes.items():
    df = df.withColumnRenamed(nome_antigo, nome_novo)

In [None]:
def _extrator_data(indice):
    """Build a UDF that returns one dot-separated piece of a date text.

    The birth dates in this dataset are text such as '31.08.1992', so index 0
    is the day, 1 the month and 2 the year.

    Args:
        indice: Zero-based position of the piece to return after splitting
            the text on '.'.

    Returns:
        A Spark UDF (default string return type) that yields the requested
        piece, or None when the input value is null.
    """
    def extrair(data):
        # A null cell reaches the Python function as None; without this guard
        # None.split(...) raises AttributeError and fails the whole Spark job.
        if data is None:
            return None
        # Malformed text with too few pieces still raises IndexError on
        # purpose: silently returning None would let the later
        # concat_ws/cast step build a partial date.
        return data.split('.')[indice]

    return udf(extrair)


dia = _extrator_data(0)
mes = _extrator_data(1)
ano = _extrator_data(2)

In [None]:
# Split the text birth date into 'Dia', 'Mes' and 'Ano' (string columns) with
# the dia/mes/ano UDFs. The column is always referenced with its exact name,
# 'Nascimento': the previous lower-case 'nascimento' only resolved because
# Spark's column resolution is case-insensitive by default.
df = (
    df.withColumn('Dia', dia('Nascimento'))
    .withColumn('Mes', mes('Nascimento'))
    .withColumn('Ano', ano('Nascimento'))
)

# Rebuild the date as 'yyyy-MM-dd' text and cast it to a real DateType column.
df = df.withColumn('Data_Nascimento', concat_ws('-', 'Ano', 'Mes', 'Dia').cast(DateType()))
df.show(5)

In [None]:
+---------+------+-------+------------------+----------+-------------+--------------------+------+----+---+---+----+---------------+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nascimento|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|
+---------+------+-------+------------------+----------+-------------+--------------------+------+----+---+---+----+---------------+
|Argentina|     3|     DF|TAGLIAFICO Nicolas|31.08.1992|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65| 31| 08|1992|     1992-08-31|
|Argentina|    22|     MF|    PAVON Cristian|21.01.1996|        PAVÓN|CA Boca Juniors (...|   169|  65| 21| 01|1996|     1996-01-21|
|Argentina|    15|     MF|    LANZINI Manuel|15.02.1993|      LANZINI|West Ham United F...|   167|  66| 15| 02|1993|     1993-02-15|
|Argentina|    18|     DF|    SALVIO Eduardo|13.07.1990|       SALVIO|    SL Benfica (POR)|   167|  69| 13| 07|1990|     1990-07-13|
|Argentina|    10|     FW|      MESSI Lionel|24.06.1987|        MESSI|  FC Barcelona (ESP)|   170|  72| 24| 06|1987|     1987-06-24|
+---------+------+-------+------------------+----------+-------------+--------------------+------+----+---+---+----+---------------+

In [23]:
# Print each column's name, type and nullability to confirm the renames and
# the new date column.
df.printSchema()

root
 |-- Selecao: string (nullable = true)
 |-- Numero: integer (nullable = true)
 |-- Posicao: string (nullable = true)
 |-- Nome_FIFA: string (nullable = true)
 |-- Nascimento: string (nullable = true)
 |-- Nome Camiseta: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Altura: integer (nullable = true)
 |-- Peso: integer (nullable = true)
 |-- Dia: string (nullable = true)
 |-- Mes: string (nullable = true)
 |-- Ano: string (nullable = true)
 |-- Data_Nascimento: date (nullable = true)



##### Fim das alterações Aula PySpark 01

In [24]:
# Preview the first 5 rows after the changes carried over from lesson 01.
df.show(n=5)

+---------+------+-------+------------------+----------+-------------+--------------------+------+----+---+---+----+---------------+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nascimento|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|
+---------+------+-------+------------------+----------+-------------+--------------------+------+----+---+---+----+---------------+
|Argentina|     3|     DF|TAGLIAFICO Nicolas|31.08.1992|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65| 31| 08|1992|     1992-08-31|
|Argentina|    22|     MF|    PAVON Cristian|21.01.1996|        PAVÓN|CA Boca Juniors (...|   169|  65| 21| 01|1996|     1996-01-21|
|Argentina|    15|     MF|    LANZINI Manuel|15.02.1993|      LANZINI|West Ham United F...|   167|  66| 15| 02|1993|     1993-02-15|
|Argentina|    18|     DF|    SALVIO Eduardo|13.07.1990|       SALVIO|    SL Benfica (POR)|   167|  69| 13| 07|1990|     1990-07-13|
|Argentina|    10|     FW|      MESSI Lionel|24.06.1987|        MESSI

##### Drop de Colunas

In [27]:
# Remove the text birth date; the parsed 'Data_Nascimento' column (plus
# 'Dia', 'Mes' and 'Ano') already carries the same information.
df = df.drop('Nascimento')

In [28]:
# Preview the first 5 rows to confirm that 'Nascimento' is gone.
df.show(n=5)

+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|
+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+
|Argentina|     3|     DF|TAGLIAFICO Nicolas|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65| 31| 08|1992|     1992-08-31|
|Argentina|    22|     MF|    PAVON Cristian|        PAVÓN|CA Boca Juniors (...|   169|  65| 21| 01|1996|     1996-01-21|
|Argentina|    15|     MF|    LANZINI Manuel|      LANZINI|West Ham United F...|   167|  66| 15| 02|1993|     1993-02-15|
|Argentina|    18|     DF|    SALVIO Eduardo|       SALVIO|    SL Benfica (POR)|   167|  69| 13| 07|1990|     1990-07-13|
|Argentina|    10|     FW|      MESSI Lionel|        MESSI|  FC Barcelona (ESP)|   170|  72| 24| 06|1987|     1987-06-24|
+---------+------+------

##### Criar Backup

In [29]:
# Keep a second name bound to the current DataFrame. This is plain assignment
# (no data is copied); it works as a "backup" because later cells rebind `df`
# to new DataFrames instead of modifying this one.
df2 = df

#### Window Ranking Functions

- Window Function 1 - Numero de linhas - row_number()
- Window Function 2 - Ranking 1 - rank()
- Window Function 3 - Ranking 2 - dense_rank()
- Window Function 4 - Porcentagem Ranking - percent_rank()
- Window Function 5 - Divisão em ' N ' partes - ntile()

In [30]:
# Preview the first 5 rows of the DataFrame used by the window examples.
df.show(n=5)

+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|
+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+
|Argentina|     3|     DF|TAGLIAFICO Nicolas|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65| 31| 08|1992|     1992-08-31|
|Argentina|    22|     MF|    PAVON Cristian|        PAVÓN|CA Boca Juniors (...|   169|  65| 21| 01|1996|     1996-01-21|
|Argentina|    15|     MF|    LANZINI Manuel|      LANZINI|West Ham United F...|   167|  66| 15| 02|1993|     1993-02-15|
|Argentina|    18|     DF|    SALVIO Eduardo|       SALVIO|    SL Benfica (POR)|   167|  69| 13| 07|1990|     1990-07-13|
|Argentina|    10|     FW|      MESSI Lionel|        MESSI|  FC Barcelona (ESP)|   170|  72| 24| 06|1987|     1987-06-24|
+---------+------+------

##### Window Function 1 - Numero de linhas - row_number()

In [34]:
# Window spec: one partition per national team, tallest player first.
num_linha = Window.partitionBy('Selecao').orderBy(col('Altura').desc())

# row_number() numbers the rows 1, 2, 3, ... inside each partition; players
# with the same height still receive different numbers.
df.withColumn('n', row_number().over(num_linha)).show(n=50)

+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+---+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|  n|
+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+---+
|Argentina|     6|     DF|    FAZIO Federico|        FAZIO|       AS Roma (ITA)|   199|  85| 17| 03|1987|     1987-03-17|  1|
|Argentina|     1|     GK|     GUZMAN Nahuel|       GUZMÁN|   Tigres UANL (MEX)|   192|  90| 10| 02|1986|     1986-02-10|  2|
|Argentina|    16|     DF|       ROJO Marcos|         ROJO|Manchester United...|   189|  82| 20| 03|1990|     1990-03-20|  3|
|Argentina|    12|     GK|     ARMANI Franco|       ARMANI|CA River Plate (ARG)|   189|  85| 16| 10|1986|     1986-10-16|  4|
|Argentina|    23|     GK|CABALLERO Wilfredo|    CABALLERO|    Chelsea FC (ENG)|   186|  80| 28| 09|1981|     1981-09-

##### Window Function 2 - Ranking 1 - rank()

In [35]:
# Window spec: one partition per national team, tallest player first.
rank1 = Window.partitionBy('Selecao').orderBy(col('Altura').desc())

# rank() gives tied heights the same position and then skips the positions
# consumed by the tie.
df.withColumn('rank', rank().over(rank1)).show(n=50)

+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+----+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|rank|
+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+----+
|Argentina|     6|     DF|    FAZIO Federico|        FAZIO|       AS Roma (ITA)|   199|  85| 17| 03|1987|     1987-03-17|   1|
|Argentina|     1|     GK|     GUZMAN Nahuel|       GUZMÁN|   Tigres UANL (MEX)|   192|  90| 10| 02|1986|     1986-02-10|   2|
|Argentina|    16|     DF|       ROJO Marcos|         ROJO|Manchester United...|   189|  82| 20| 03|1990|     1990-03-20|   3|
|Argentina|    12|     GK|     ARMANI Franco|       ARMANI|CA River Plate (ARG)|   189|  85| 16| 10|1986|     1986-10-16|   3|
|Argentina|    23|     GK|CABALLERO Wilfredo|    CABALLERO|    Chelsea FC (ENG)|   186|  80| 28| 09|1981|     1

##### Window Function 3 - Ranking 2 - dense_rank()

In [36]:
# Window spec: one partition per national team, tallest player first.
rank2 = Window.partitionBy('Selecao').orderBy(col('Altura').desc())

# dense_rank() gives tied heights the same position and continues with the
# next consecutive position (no gaps).
df.withColumn('rank2', dense_rank().over(rank2)).show(n=50)

+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+-----+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|rank2|
+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+-----+
|Argentina|     6|     DF|    FAZIO Federico|        FAZIO|       AS Roma (ITA)|   199|  85| 17| 03|1987|     1987-03-17|    1|
|Argentina|     1|     GK|     GUZMAN Nahuel|       GUZMÁN|   Tigres UANL (MEX)|   192|  90| 10| 02|1986|     1986-02-10|    2|
|Argentina|    16|     DF|       ROJO Marcos|         ROJO|Manchester United...|   189|  82| 20| 03|1990|     1990-03-20|    3|
|Argentina|    12|     GK|     ARMANI Franco|       ARMANI|CA River Plate (ARG)|   189|  85| 16| 10|1986|     1986-10-16|    3|
|Argentina|    23|     GK|CABALLERO Wilfredo|    CABALLERO|    Chelsea FC (ENG)|   186|  80| 28| 09|1981

##### Window Function 4 - Porcentagem Ranking - percent_rank()

In [37]:
# Window spec: one partition per national team, tallest player first.
porcentagem = Window.partitionBy('Selecao').orderBy(col('Altura').desc())

# percent_rank() is the relative rank inside the partition, from 0.0 for the
# first row: (rank - 1) / (rows in partition - 1).
df.withColumn('%', percent_rank().over(porcentagem)).show(n=50)

+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+--------------------+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|                   %|
+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+--------------------+
|Argentina|     6|     DF|    FAZIO Federico|        FAZIO|       AS Roma (ITA)|   199|  85| 17| 03|1987|     1987-03-17|                 0.0|
|Argentina|     1|     GK|     GUZMAN Nahuel|       GUZMÁN|   Tigres UANL (MEX)|   192|  90| 10| 02|1986|     1986-02-10|0.045454545454545456|
|Argentina|    16|     DF|       ROJO Marcos|         ROJO|Manchester United...|   189|  82| 20| 03|1990|     1990-03-20| 0.09090909090909091|
|Argentina|    12|     GK|     ARMANI Franco|       ARMANI|CA River Plate (ARG)|   189|  85| 16| 10|1986|     1986-10-16| 0.09090909090909091|

In [None]:
+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+--------------------+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|                   %|
+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+--------------------+
|Argentina|     6|     DF|    FAZIO Federico|        FAZIO|       AS Roma (ITA)|   199|  85| 17| 03|1987|     1987-03-17|                 0.0|
|Argentina|     1|     GK|     GUZMAN Nahuel|       GUZMÁN|   Tigres UANL (MEX)|   192|  90| 10| 02|1986|     1986-02-10|0.045454545454545456|
|Argentina|    16|     DF|       ROJO Marcos|         ROJO|Manchester United...|   189|  82| 20| 03|1990|     1990-03-20| 0.09090909090909091|
|Argentina|    12|     GK|     ARMANI Franco|       ARMANI|CA River Plate (ARG)|   189|  85| 16| 10|1986|     1986-10-16| 0.09090909090909091|
|Argentina|    23|     GK|CABALLERO Wilfredo|    CABALLERO|    Chelsea FC (ENG)|   186|  80| 28| 09|1981|     1981-09-28| 0.18181818181818182|
|Argentina|     9|     FW|   HIGUAIN Gonzalo|      HIGUAÍN|   Juventus FC (ITA)|   184|  75| 10| 12|1987|     1987-12-10| 0.22727272727272727|
|Argentina|     4|     DF|  ANSALDI Cristian|      ANSALDI|     Torino FC (ITA)|   181|  73| 20| 09|1986|     1986-09-20|  0.2727272727272727|
|Argentina|     2|     DF|   MERCADO Gabriel|      MERCADO|    Sevilla FC (ESP)|   181|  81| 18| 03|1987|     1987-03-18|  0.2727272727272727|
|Argentina|    17|     DF|  OTAMENDI Nicolas|     OTAMENDI|Manchester City F...|   181|  81| 12| 02|1988|     1988-02-12|  0.2727272727272727|
|Argentina|    13|     MF|  MEZA Maximiliano|         MEZA|CA Independiente ...|   180|  76| 15| 12|1992|     1992-12-15|  0.4090909090909091|
|Argentina|    11|     MF|    DI MARIA Angel|     DI MARÍA|Paris Saint-Germa...|   178|  75| 14| 02|1988|     1988-02-14| 0.45454545454545453|
|Argentina|    21|     FW|      DYBALA Paulo|       DYBALA|   Juventus FC (ITA)|   177|  73| 15| 11|1993|     1993-11-15|                 0.5|
|Argentina|    20|     MF|  LO CELSO Giovani|     LO CELSO|Paris Saint-Germa...|   177|  75| 09| 04|1996|     1996-04-09|                 0.5|
|Argentina|     5|     MF|      BIGLIA Lucas|       BIGLIA|      AC Milan (ITA)|   175|  73| 30| 01|1986|     1986-01-30|  0.5909090909090909|
|Argentina|     7|     MF|       BANEGA Ever|       BANEGA|    Sevilla FC (ESP)|   175|  73| 29| 06|1988|     1988-06-29|  0.5909090909090909|
|Argentina|    14|     DF| MASCHERANO Javier|   MASCHERANO|Hebei China Fortu...|   174|  73| 08| 06|1984|     1984-06-08|  0.6818181818181818|
|Argentina|    19|     FW|     AGUERO Sergio|       AGÜERO|Manchester City F...|   172|  74| 02| 06|1988|     1988-06-02|  0.7272727272727273|
|Argentina|     8|     DF|      ACUNA Marcos|        ACUÑA|   Sporting CP (POR)|   172|  77| 28| 10|1991|     1991-10-28|  0.7272727272727273|
|Argentina|    10|     FW|      MESSI Lionel|        MESSI|  FC Barcelona (ESP)|   170|  72| 24| 06|1987|     1987-06-24|  0.8181818181818182|
|Argentina|     3|     DF|TAGLIAFICO Nicolas|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65| 31| 08|1992|     1992-08-31|  0.8636363636363636|
|Argentina|    22|     MF|    PAVON Cristian|        PAVÓN|CA Boca Juniors (...|   169|  65| 21| 01|1996|     1996-01-21|  0.8636363636363636|
|Argentina|    15|     MF|    LANZINI Manuel|      LANZINI|West Ham United F...|   167|  66| 15| 02|1993|     1993-02-15|  0.9545454545454546|
|Argentina|    18|     DF|    SALVIO Eduardo|       SALVIO|    SL Benfica (POR)|   167|  69| 13| 07|1990|     1990-07-13|  0.9545454545454546|
|Australia|    12|     GK|        JONES Brad|        JONES|Feyenoord Rotterd...|   193|  87| 19| 03|1982|     1982-03-19|                 0.0|
|Australia|     9|     FW|        JURIC Tomi|        JURIC|     FC Luzern (SUI)|   190|  80| 22| 07|1991|     1991-07-22|0.045454545454545456|
|Australia|     6|     DF|    JURMAN Matthew|       JURMAN|Suwon Samsung Blu...|   190|  83| 08| 12|1989|     1989-12-08|0.045454545454545456|
|Australia|    23|     MF|         ROGIC Tom|        ROGIC|     Celtic FC (SCO)|   189|  88| 16| 12|1992|     1992-12-16| 0.13636363636363635|
|Australia|    15|     MF|      JEDINAK Mile|      JEDINAK|Aston Villa FC (ENG)|   188|  78| 03| 08|1984|     1984-08-03| 0.18181818181818182|
|Australia|     2|     DF|     DEGENEK Milos|      DEGENEK|Yokohama F-Marino...|   187|  85| 28| 04|1994|     1994-04-28| 0.22727272727272727|
|Australia|    18|     GK|     VUKOVIC Danny|      VUKOVIC|      KRC Genk (BEL)|   187|  94| 27| 03|1985|     1985-03-27| 0.22727272727272727|
|Australia|     1|     GK|       RYAN Mathew|         RYAN|Brighton & Hove A...|   184|  82| 08| 04|1992|     1992-04-08|  0.3181818181818182|
|Australia|    20|     DF|   SAINSBURY Trent|    SAINSBURY|Grasshopper Club ...|   183|  76| 05| 01|1992|     1992-01-05| 0.36363636363636365|
|Australia|     7|     FW|     LECKIE Mathew|       LECKIE|    Hertha BSC (GER)|   181|  84| 04| 02|1991|     1991-02-04|  0.4090909090909091|
|Australia|    10|     FW|      KRUSE Robbie|        KRUSE|    VfL Bochum (GER)|   180|  66| 05| 10|1988|     1988-10-05| 0.45454545454545453|
|Australia|     4|     FW|        CAHILL Tim|       CAHILL|   Millwall FC (ENG)|   180|  76| 06| 12|1979|     1979-12-06| 0.45454545454545453|
|Australia|    22|     MF|    IRVINE Jackson|       IRVINE|  Hull City FC (ENG)|   180|  77| 07| 03|1993|     1993-03-07| 0.45454545454545453|
|Australia|     3|     DF|    MEREDITH James|     MEREDITH|   Millwall FC (ENG)|   179|  71| 05| 04|1988|     1988-04-05|  0.5909090909090909|
|Australia|    14|     FW|    MacLAREN Jamie|     MACLAREN|  Hibernian FC (SCO)|   178|  72| 29| 07|1993|     1993-07-29|  0.6363636363636364|
|Australia|     5|     DF|     MILLIGAN Mark|     MILLIGAN|    Al Ahli SC (KSA)|   178|  78| 04| 08|1985|     1985-08-04|  0.6363636363636364|
|Australia|    11|     FW|    NABBOUT Andrew|      NABBOUT|    Urawa Reds (JPN)|   178|  85| 17| 12|1992|     1992-12-17|  0.6363636363636364|
|Australia|    21|     FW|PETRATOS Dimitrios|     PETRATOS|Newcastle United ...|   176|  72| 10| 11|1992|     1992-11-10|  0.7727272727272727|
|Australia|     8|     MF|    LUONGO Massimo|       LUONGO|Queens Park Range...|   176|  76| 25| 09|1992|     1992-09-25|  0.7727272727272727|
|Australia|    13|     MF|        MOOY Aaron|         MOOY|Huddersfield Town...|   173|  72| 15| 09|1990|     1990-09-15|  0.8636363636363636|
|Australia|    17|     FW|     ARZANI Daniel|       ARZANI|Melbourne City FC...|   171|  73| 04| 01|1999|     1999-01-04|  0.9090909090909091|
|Australia|    16|     DF|       BEHICH Aziz|       BEHICH|     Bursaspor (TUR)|   170|  63| 16| 12|1990|     1990-12-16|  0.9545454545454546|
|Australia|    19|     DF|     RISDON Joshua|       RISDON|WS Wanderers FC (...|   169|  70| 27| 07|1992|     1992-07-27|                 1.0|
|  Belgium|     1|     GK|  COURTOIS Thibaut|     COURTOIS|    Chelsea FC (ENG)|   199|  91| 11| 05|1992|     1992-05-11|                 0.0|
|  Belgium|    13|     GK|     CASTEELS Koen|     CASTEELS| VfL Wolfsburg (GER)|   197|  86| 25| 06|1992|     1992-06-25|0.045454545454545456|
|  Belgium|     8|     MF| FELLAINI Marouane|     FELLAINI|Manchester United...|   194|  85| 22| 11|1987|     1987-11-22| 0.09090909090909091|
|  Belgium|    12|     GK|    MIGNOLET Simon|     MIGNOLET|  Liverpool FC (ENG)|   193|  87| 06| 03|1988|     1988-03-06| 0.13636363636363635|
+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+--------------------+

##### Window Function 5 - Divisão em ' N ' partes - ntile()

In [40]:
# Window spec: one partition per national team, tallest player first.
parte = Window.partitionBy('Selecao').orderBy(col('Altura').desc())

# ntile(5) splits the ordered rows of each team into 5 buckets and stores the
# bucket number (starting at 1) in 'Par'.
df.withColumn('Par', ntile(5).over(parte)).show(n=50)

+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+---+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|Par|
+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+---+
|Argentina|     6|     DF|    FAZIO Federico|        FAZIO|       AS Roma (ITA)|   199|  85| 17| 03|1987|     1987-03-17|  1|
|Argentina|     1|     GK|     GUZMAN Nahuel|       GUZMÁN|   Tigres UANL (MEX)|   192|  90| 10| 02|1986|     1986-02-10|  1|
|Argentina|    16|     DF|       ROJO Marcos|         ROJO|Manchester United...|   189|  82| 20| 03|1990|     1990-03-20|  1|
|Argentina|    12|     GK|     ARMANI Franco|       ARMANI|CA River Plate (ARG)|   189|  85| 16| 10|1986|     1986-10-16|  1|
|Argentina|    23|     GK|CABALLERO Wilfredo|    CABALLERO|    Chelsea FC (ENG)|   186|  80| 28| 09|1981|     1981-09-

#### Window Analytic Functions (Funções analíticas)

In [42]:
# Remove the full FIFA name to keep the following tables narrower. The name
# must match the column exactly ('Nome_FIFA'): drop() silently does nothing
# when the name does not resolve, and the previous 'Nome_Fifa' spelling only
# resolved because Spark's column resolution is case-insensitive by default.
df = df.drop('Nome_FIFA')

##### Window Function 6 - LAG / Degrau - lag()

In [48]:
# Window spec: one partition per national team, tallest player first.
degrau = Window.partitionBy('Selecao').orderBy(col('Altura').desc())

# lag('Altura', 5) reads the height found 5 rows earlier in the same
# partition; the first 5 rows of each team have no such row and get null.
df.withColumn('degrau', lag('Altura', 5).over(degrau)).show(n=50)

+---------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+------+
|  Selecao|Numero|Posicao|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|degrau|
+---------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+------+
|Argentina|     6|     DF|        FAZIO|       AS Roma (ITA)|   199|  85| 17| 03|1987|     1987-03-17|  null|
|Argentina|     1|     GK|       GUZMÁN|   Tigres UANL (MEX)|   192|  90| 10| 02|1986|     1986-02-10|  null|
|Argentina|    16|     DF|         ROJO|Manchester United...|   189|  82| 20| 03|1990|     1990-03-20|  null|
|Argentina|    12|     GK|       ARMANI|CA River Plate (ARG)|   189|  85| 16| 10|1986|     1986-10-16|  null|
|Argentina|    23|     GK|    CABALLERO|    Chelsea FC (ENG)|   186|  80| 28| 09|1981|     1981-09-28|  null|
|Argentina|     9|     FW|      HIGUAÍN|   Juventus FC (ITA)|   184|  75| 10| 12|1987|     1987-12-10|   199|
|Argentina

##### Window Function 7 - Lead / Degrau - lead()

In [46]:
# Window spec: one partition per national team, tallest player first.
degrau = Window.partitionBy('Selecao').orderBy(col('Altura').desc())

# lead('Altura') reads the height of the next row in the same partition
# (default offset of 1).
df.withColumn('degrau', lead('Altura').over(degrau)).show(n=50)

+---------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+------+
|  Selecao|Numero|Posicao|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|degrau|
+---------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+------+
|Argentina|     6|     DF|        FAZIO|       AS Roma (ITA)|   199|  85| 17| 03|1987|     1987-03-17|   192|
|Argentina|     1|     GK|       GUZMÁN|   Tigres UANL (MEX)|   192|  90| 10| 02|1986|     1986-02-10|   189|
|Argentina|    16|     DF|         ROJO|Manchester United...|   189|  82| 20| 03|1990|     1990-03-20|   189|
|Argentina|    12|     GK|       ARMANI|CA River Plate (ARG)|   189|  85| 16| 10|1986|     1986-10-16|   186|
|Argentina|    23|     GK|    CABALLERO|    Chelsea FC (ENG)|   186|  80| 28| 09|1981|     1981-09-28|   184|
|Argentina|     9|     FW|      HIGUAÍN|   Juventus FC (ITA)|   184|  75| 10| 12|1987|     1987-12-10|   181|
|Argentina

#### Agregações

##### GroupBy + AGG 1

In [54]:
# Average height per national team, tallest average first.
(
    df.groupBy('Selecao')
    .agg(avg('Altura'))
    .orderBy(desc('avg(Altura)'))
    .show(n=50)
)

+--------------+------------------+
|       Selecao|       avg(Altura)|
+--------------+------------------+
|        Serbia|186.69565217391303|
|       Denmark| 186.6086956521739|
|       Germany| 185.7826086956522|
|        Sweden| 185.7391304347826|
|       Iceland|185.52173913043478|
|       Belgium|185.34782608695653|
|       Croatia| 185.2608695652174|
|       Nigeria|184.52173913043478|
|       IR Iran|184.47826086956522|
|        Russia| 184.3913043478261|
|       Senegal|183.65217391304347|
|        France|183.30434782608697|
|        Poland|183.17391304347825|
|       Tunisia|183.08695652173913|
|   Switzerland|182.91304347826087|
|       England| 182.7391304347826|
|       Morocco|182.69565217391303|
|        Panama|182.17391304347825|
|Korea Republic| 181.8695652173913|
|       Uruguay|181.04347826086956|
|         Egypt|             181.0|
|     Australia| 180.8695652173913|
|        Brazil| 180.7826086956522|
|      Colombia| 180.7826086956522|
|    Costa Rica|180.69565217

##### GroupBy + AGG 2

In [57]:
# Tallest player's height per national team, highest value first.
(
    df.groupBy('Selecao')
    .agg({'Altura': 'max'})
    .orderBy(desc('max(Altura)'))
    .show(n=50)
)

+--------------+-----------+
|       Selecao|max(Altura)|
+--------------+-----------+
|       Croatia|        201|
|       Denmark|        200|
|     Argentina|        199|
|       Belgium|        199|
|        Sweden|        198|
|       Iceland|        198|
|        France|        197|
|Korea Republic|        197|
|       Nigeria|        197|
|        Panama|        197|
|       Senegal|        196|
|        Russia|        196|
|       Uruguay|        196|
|       England|        196|
|    Costa Rica|        196|
|        Poland|        195|
|        Serbia|        195|
|        Brazil|        195|
|       Germany|        195|
|         Spain|        194|
|         Egypt|        194|
|      Colombia|        194|
|       IR Iran|        194|
|     Australia|        193|
|   Switzerland|        192|
|  Saudi Arabia|        192|
|       Tunisia|        192|
|      Portugal|        191|
|       Morocco|        190|
|        Mexico|        190|
|          Peru|        189|
|         Japa

##### Where

In [59]:
# Keep only Brazil's defenders: both conditions must hold.
df.where((col('Selecao') == 'Brazil') & (col('Posicao') == 'DF')).show(n=25)

Sintaxe com expressões de coluna: `where((condicao 1) & (condicao 2))` para "E" ou `where((condicao 1) | (condicao 2))` para "OU".

+-------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+
|Selecao|Numero|Posicao|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|
+-------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+
| Brazil|    22|     DF|       FAGNER|SC Corinthians (BRA)|   168|  67| 11| 06|1989|     1989-06-11|
| Brazil|     6|     DF|  FILIPE LUIS|Atletico Madrid (...|   182|  73| 09| 08|1985|     1985-08-09|
| Brazil|    13|     DF|   MARQUINHOS|Paris Saint-Germa...|   183|  75| 14| 05|1994|     1994-05-14|
| Brazil|     3|     DF|      MIRANDA|FC Internazionale...|   186|  78| 07| 09|1984|     1984-09-07|
| Brazil|    14|     DF|       DANILO|Manchester City F...|   184|  78| 15| 07|1991|     1991-07-15|
| Brazil|     2|     DF|     T. SILVA|Paris Saint-Germa...|   183|  79| 22| 09|1984|     1984-09-22|
| Brazil|    12|     DF|      MARCELO|Real Madrid CF (ESP)|   174|  80| 12| 05|1988|     19

In [69]:
# Window spec: one partition per national team, tallest player first.
top1 = Window.partitionBy('Selecao').orderBy(desc("Altura"))

# Number the players of each team and keep only position 1 (the tallest).
# 'Top' is an integer column, so it is compared with the integer 1; the
# previous SQL text 'Top = "1"' compared it with a double-quoted string and
# depended on implicit type coercion.
# NOTE(review): row_number() breaks height ties arbitrarily, so a team with
# two equally tall players may show either one - add a tie-breaker column to
# the orderBy if a deterministic pick is needed.
df.withColumn('Top', row_number().over(top1)).filter(col('Top') == 1).show(50)

+--------------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+---+
|       Selecao|Numero|Posicao|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|Top|
+--------------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+---+
|     Argentina|     6|     DF|        FAZIO|       AS Roma (ITA)|   199|  85| 17| 03|1987|     1987-03-17|  1|
|     Australia|    12|     GK|        JONES|Feyenoord Rotterd...|   193|  87| 19| 03|1982|     1982-03-19|  1|
|       Belgium|     1|     GK|     COURTOIS|    Chelsea FC (ENG)|   199|  91| 11| 05|1992|     1992-05-11|  1|
|        Brazil|    16|     GK|       CASSIO|SC Corinthians (BRA)|   195|  92| 06| 06|1987|     1987-06-06|  1|
|      Colombia|    13|     DF|      Y. MINA|  FC Barcelona (ESP)|   194|  95| 23| 09|1994|     1994-09-23|  1|
|    Costa Rica|    19|     DF|    K. WASTON|Vancouver Whiteca...|   196|  87| 01| 01|1988|     1988-01-

##### Describe

In [60]:
# Summary statistics (count, mean, stddev, min, max) for the whole dataset.
df.describe().show()

+-------+---------+-----------------+-------+-------------+--------------------+-----------------+-----------------+------------------+------------------+------------------+
|summary|  Selecao|           Numero|Posicao|Nome Camiseta|                Time|           Altura|             Peso|               Dia|               Mes|               Ano|
+-------+---------+-----------------+-------+-------------+--------------------+-----------------+-----------------+------------------+------------------+------------------+
|  count|      736|              736|    736|          736|                 736|              736|              736|               736|               736|               736|
|   mean|     null|             12.0|   null|         null|                null|182.4076086956522|77.18885869565217|15.793478260869565|5.8790760869565215| 1990.110054347826|
| stddev|     null|6.637760461599851|   null|         null|                null|6.930924233929302|7.233778346883639| 8.76112382873

In [None]:
+-------+---------+-----------------+-------+-------------+--------------------+-----------------+-----------------+------------------+------------------+------------------+
|summary|  Selecao|           Numero|Posicao|Nome Camiseta|                Time|           Altura|             Peso|               Dia|               Mes|               Ano|
+-------+---------+-----------------+-------+-------------+--------------------+-----------------+-----------------+------------------+------------------+------------------+
|  count|      736|              736|    736|          736|                 736|              736|              736|               736|               736|               736|
|   mean|     null|             12.0|   null|         null|                null|182.4076086956522|77.18885869565217|15.793478260869565|5.8790760869565215| 1990.110054347826|
| stddev|     null|6.637760461599851|   null|         null|                null|6.930924233929302|7.233778346883639| 8.761123828732469|3.3782493094684387|3.9074472063626775|
|    min|Argentina|                1|     DF|    A. ASHRAF|    1. FC Köln (GER)|              165|               59|                01|                01|              1973|
|    max|  Uruguay|               23|     MF|     ŽIVKOVIĆ|Étoile du Sahel (...|              201|               99|                31|                12|              1999|
+-------+---------+-----------------+-------+-------------+--------------------+-----------------+-----------------+------------------+------------------+------------------+

In [61]:
# Same summary statistics, restricted to Brazil's squad.
df.filter(col('Selecao') == 'Brazil').describe().show()

+-------+-------+-----------------+-------+-------------+--------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|summary|Selecao|           Numero|Posicao|Nome Camiseta|                Time|           Altura|             Peso|              Dia|              Mes|              Ano|
+-------+-------+-----------------+-------+-------------+--------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|  count|     23|               23|     23|           23|                  23|               23|               23|               23|               23|               23|
|   mean|   null|             12.0|   null|         null|                null|180.7826086956522|76.56521739130434|11.26086956521739|6.130434782608695|1989.391304347826|
| stddev|   null|6.782329983125267|   null|         null|                null|7.354383490255254|8.239737898283606|6.876953549252538|2.784769418006175|3.499

In [None]:
+-------+-------+-----------------+-------+-------------+--------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|summary|Selecao|           Numero|Posicao|Nome Camiseta|                Time|           Altura|             Peso|              Dia|              Mes|              Ano|
+-------+-------+-----------------+-------+-------------+--------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|  count|     23|               23|     23|           23|                  23|               23|               23|               23|               23|               23|
|   mean|   null|             12.0|   null|         null|                null|180.7826086956522|76.56521739130434|11.26086956521739|6.130434782608695|1989.391304347826|
| stddev|   null|6.782329983125267|   null|         null|                null|7.354383490255254|8.239737898283606|6.876953549252538|2.784769418006175|3.499858833968506|
|    min| Brazil|                1|     DF|    A. BECKER|       AS Roma (ITA)|              168|               64|               02|               01|             1984|
|    max| Brazil|               23|     MF|      WILLIAN|SC Corinthians (BRA)|              195|               92|               25|               10|             1997|
+-------+-------+-----------------+-------+-------------+--------------------+-----------------+-----------------+-----------------+-----------------+-----------------+

##### Window Function 8 - Função de agregação usando Window Function

In [65]:
# Aggregation through window functions: mean, max and min height per team.
# NOTE: avg / max / min / row_number / desc are the pyspark.sql.functions
# versions pulled in by the star import at the top of the notebook
# (max and min shadow the Python builtins here).

# Ordered window: used only to number the rows inside each team, so that a
# single row per team can be kept once the aggregates are attached.
parametro = Window.partitionBy('Selecao').orderBy(desc("Altura"))
# Unordered window: aggregates are evaluated over the whole team partition.
parametro2 = Window.partitionBy('Selecao')

(
    df
    .withColumn('linhax', row_number().over(parametro))
    .withColumn('media', avg('Altura').over(parametro2))
    .withColumn('max', max('Altura').over(parametro2))
    .withColumn('min', min('Altura').over(parametro2))
    # row_number() produces an integer, so compare against an integer literal
    # instead of relying on an implicit string-to-int cast.
    .filter('linhax = 1')
    .select('Selecao', 'media', 'max', 'min')
    .orderBy('media', ascending=False)
    .show(50)
)

+--------------+------------------+---+---+
|       Selecao|             media|max|min|
+--------------+------------------+---+---+
|        Serbia|186.69565217391303|195|169|
|       Denmark| 186.6086956521739|200|171|
|       Germany| 185.7826086956522|195|176|
|        Sweden| 185.7391304347826|198|177|
|       Iceland|185.52173913043478|198|170|
|       Belgium|185.34782608695653|199|169|
|       Croatia| 185.2608695652174|201|172|
|       Nigeria|184.52173913043478|197|172|
|       IR Iran|184.47826086956522|194|177|
|        Russia| 184.3913043478261|196|173|
|       Senegal|183.65217391304347|196|173|
|        France|183.30434782608697|197|168|
|        Poland|183.17391304347825|195|172|
|       Tunisia|183.08695652173913|192|170|
|   Switzerland|182.91304347826087|192|165|
|       England| 182.7391304347826|196|170|
|       Morocco|182.69565217391303|190|167|
|        Panama|182.17391304347825|197|165|
|Korea Republic| 181.8695652173913|197|170|
|       Uruguay|181.043478260869

In [None]:
+--------------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+------+------------------+---+---+
|       Selecao|Numero|Posicao|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|linhax|             media|max|min|
+--------------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+------+------------------+---+---+
|        Serbia|    15|     DF|   MILENKOVIĆ|ACF Fiorentina (ITA)|   195|  90| 12| 10|1997|     1997-10-12|     1|186.69565217391303|195|169|
|       Denmark|     3|     DF|  VESTERGAARD|VfL Borussia Mönc...|   200|  98| 03| 08|1992|     1992-08-03|     1| 186.6086956521739|200|171|
|       Germany|    15|     DF|         SÜLE|FC Bayern München...|   195|  89| 03| 09|1995|     1995-09-03|     1| 185.7826086956522|195|176|
|        Sweden|     1|     GK|        OLSEN|  FC Kobenhavn (DEN)|   198|  89| 08| 01|1990|     1990-01-08|     1| 185.7391304347826|198|177|
|       Iceland|    12|     GK|       SCHRAM|   Roskilde BK (DEN)|   198|  92| 19| 01|1995|     1995-01-19|     1|185.52173913043478|198|170|
|       Belgium|     1|     GK|     COURTOIS|    Chelsea FC (ENG)|   199|  91| 11| 05|1992|     1992-05-11|     1|185.34782608695653|199|169|
|       Croatia|    12|     GK|   L. KALINIĆ|      KAA Gent (BEL)|   201|  96| 03| 04|1990|     1990-04-03|     1| 185.2608695652174|201|172|
|       Nigeria|    13|     FW|      NWANKWO|    FC Crotone (ITA)|   197|  89| 07| 05|1992|     1992-05-07|     1|184.52173913043478|197|172|
|       IR Iran|     1|     GK|A. BEIRANVAND| Persepolis FC (IRN)|   194|  85| 21| 09|1992|     1992-09-21|     1|184.47826086956522|194|177|
|        Russia|    22|     FW|       DZYUBA|FC Arsenal Tula (...|   196|  90| 22| 08|1988|     1988-08-22|     1| 184.3913043478261|196|173|
|       Senegal|    23|     GK|        GOMIS|  SPAL Ferrara (ITA)|   196|  80| 05| 09|1993|     1993-09-05|     1|183.65217391304347|196|173|
|        France|    15|     MF|       NZONZI|    Sevilla FC (ESP)|   197|  87| 15| 12|1988|     1988-12-15|     1|183.30434782608697|197|168|
|        Poland|     1|     GK|     SZCZESNY|   Juventus FC (ITA)|   195|  84| 18| 04|1990|     1990-04-18|     1|183.17391304347825|195|172|
|       Tunisia|     1|     GK| BEN MUSTAPHA|  Al Shabab FC (KSA)|   192|  85| 01| 07|1989|     1989-07-01|     1|183.08695652173913|192|170|
|   Switzerland|    20|     DF|      DJOUROU|Antalyaspor AS (TUR)|   192|  90| 18| 01|1987|     1987-01-18|     1|182.91304347826087|192|165|
|       England|    13|     GK|      BUTLAND| Stoke City FC (ENG)|   196|  96| 10| 03|1993|     1993-03-10|     1| 182.7391304347826|196|170|
|       Morocco|     6|     DF|        SAISS|Wolverhampton Wan...|   190|  76| 26| 03|1990|     1990-03-26|     1|182.69565217391303|190|167|
|        Panama|    22|     GK|    RODRIGUEZ|San Francisco FC ...|   197|  83| 05| 08|1990|     1990-08-05|     1|182.17391304347825|197|165|
|Korea Republic|     9|     FW|      S W KIM|Jeonbuk Hyundai (...|   197|  97| 14| 04|1988|     1988-04-14|     1| 181.8695652173913|197|170|
|       Uruguay|    19|     DF|       COATES|   Sporting CP (POR)|   196|  89| 07| 10|1990|     1990-10-07|     1|181.04347826086956|196|168|
|         Egypt|     6|     DF|    A. HEGAZY|West Bromwich Alb...|   194|  95| 25| 01|1991|     1991-01-25|     1|             181.0|194|169|
|     Australia|    12|     GK|        JONES|Feyenoord Rotterd...|   193|  87| 19| 03|1982|     1982-03-19|     1| 180.8695652173913|193|169|
|        Brazil|    16|     GK|       CASSIO|SC Corinthians (BRA)|   195|  92| 06| 06|1987|     1987-06-06|     1| 180.7826086956522|195|168|
|      Colombia|    13|     DF|      Y. MINA|  FC Barcelona (ESP)|   194|  95| 23| 09|1994|     1994-09-23|     1| 180.7826086956522|194|169|
|    Costa Rica|    19|     DF|    K. WASTON|Vancouver Whiteca...|   196|  87| 01| 01|1988|     1988-01-01|     1|180.69565217391303|196|172|
|         Spain|     3|     DF|        PIQUÉ|  FC Barcelona (ESP)|   194|  85| 02| 02|1987|     1987-02-02|     1|179.91304347826087|194|170|
|        Mexico|     5|     DF|     D. REYES|      FC Porto (POR)|   190|  77| 19| 09|1992|     1992-09-19|     1| 179.7826086956522|190|166|
|      Portugal|     6|     DF|        FONTE|Dalian Yifang FC ...|   191|  84| 22| 12|1983|     1983-12-22|     1| 179.7391304347826|191|168|
|         Japan|    22|     DF|      YOSHIDA|Southampton FC (ENG)|   189|  78| 24| 08|1988|     1988-08-24|     1| 178.7826086956522|189|168|
|     Argentina|     6|     DF|        FAZIO|       AS Roma (ITA)|   199|  85| 17| 03|1987|     1987-03-17|     1|178.43478260869566|199|167|
|  Saudi Arabia|    12|     MF|        KANNO|  Al Hilal SFC (KSA)|   192|  73| 22| 09|1994|     1994-09-22|     1|177.65217391304347|192|165|
|          Peru|     1|     GK|      GALLESE|CD Tiburones Rojo...|   189|  79| 23| 02|1990|     1990-02-23|     1| 177.6086956521739|189|169|
+--------------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+------+------------------+---+---+

In [66]:
# Per-team height statistics (mean / max / min) computed with window functions.
# NOTE(review): this cell repeats the "Window Function 8" aggregation cell
# that precedes it, and the output recorded under it is a WindowSpec repr
# rather than a table -- confirm what this cell was meant to demonstrate, or
# drop one of the two copies.

# Window ordered by height (tallest first); it only feeds row_number().
parametro = Window.partitionBy('Selecao').orderBy(desc("Altura"))
# Window without ordering; avg / max / min see the full team partition.
parametro2 = Window.partitionBy('Selecao')

# avg / max / min below are the pyspark.sql.functions versions made available
# by the notebook's star import, not the Python builtins.
(
    df
    .withColumn('linhax', row_number().over(parametro))
    .withColumn('media', avg('Altura').over(parametro2))
    .withColumn('max', max('Altura').over(parametro2))
    .withColumn('min', min('Altura').over(parametro2))
    # Keep one row per team; linhax is an integer column, so the literal is
    # written as an integer rather than the string "1".
    .filter('linhax = 1')
    .select('Selecao', 'media', 'max', 'min')
    .orderBy('media', ascending=False)
    .show(50)
)

<pyspark.sql.window.WindowSpec at 0x14aa7cb5160>