In [2]:
# https://www.youtube.com/watch?v=xw4a9qbOh-Q

from pyspark.sql import SparkSession

# Build the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Aula Interativa 1 - Engenharia de Dados - Apache Spark")
    .getOrCreate()
)

# Load the sample CSV: the first line holds the column names and the
# column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("arquivos/aula1-2-duplicates.csv")
)

# In data-migration projects, duplicates showing up along the process are common.
df.show()

+---+-----+--------------+-------------------+
| id| nome|        cidade|               data|
+---+-----+--------------+-------------------+
|  1|pedro|    uberlandia|2021-01-01 00:00:00|
|  1|pedro|       uberaba|2022-01-01 00:00:00|
|  1|pedro|belo horizonte|2023-01-01 00:00:00|
|  1|pedro|belo horizonte|2023-01-01 00:00:00|
|  2|maria|       niteroi|2021-01-01 00:00:00|
|  2|maria|rio de janeiro|2023-01-01 00:00:00|
|  2|maria|   nova iguacu|2021-01-01 00:00:00|
|  3| josé|     são paulo|2023-01-01 00:00:00|
|  3| josé|     são paulo|2021-01-01 00:00:00|
|  3| josé|     são paulo|2022-01-01 00:00:00|
+---+-----+--------------+-------------------+



In [3]:
# Does not solve the problem: with no column subset, only rows that are
# identical in every column are collapsed, so each id still appears several times.
df.dropDuplicates().show()

+---+-----+--------------+-------------------+
| id| nome|        cidade|               data|
+---+-----+--------------+-------------------+
|  1|pedro|       uberaba|2022-01-01 00:00:00|
|  2|maria|       niteroi|2021-01-01 00:00:00|
|  1|pedro|belo horizonte|2023-01-01 00:00:00|
|  2|maria|   nova iguacu|2021-01-01 00:00:00|
|  3| josé|     são paulo|2022-01-01 00:00:00|
|  3| josé|     são paulo|2021-01-01 00:00:00|
|  1|pedro|    uberlandia|2021-01-01 00:00:00|
|  2|maria|rio de janeiro|2023-01-01 00:00:00|
|  3| josé|     são paulo|2023-01-01 00:00:00|
+---+-----+--------------+-------------------+



In [4]:
# ALTERNATIVE 1:
# dropDuplicates on the key column
from pyspark.sql.functions import col

# Rows are sorted by `data` descending before deduplicating on `id`; in the
# output shown, the row kept for each id is the one with the most recent date.
# NOTE(review): this relies on dropDuplicates keeping the first row of the
# preceding sort, which Spark may not guarantee - confirm before relying on it.
df.orderBy(col("data").desc()).dropDuplicates(["id"]).show()

+---+-----+--------------+-------------------+
| id| nome|        cidade|               data|
+---+-----+--------------+-------------------+
|  1|pedro|belo horizonte|2023-01-01 00:00:00|
|  2|maria|rio de janeiro|2023-01-01 00:00:00|
|  3| josé|     são paulo|2023-01-01 00:00:00|
+---+-----+--------------+-------------------+



In [7]:
# ALTERNATIVE 2: window functions
# A window function computes a value for each row from a group of related rows.
# row_number() is a window function: it numbers the rows inside each window.

from pyspark.sql.window import Window
# Import only the names used here. A wildcard import from
# pyspark.sql.functions would replace Python builtins such as sum, min, max
# and round with their Spark column counterparts for the rest of the notebook.
from pyspark.sql.functions import col, row_number

# One window per id, most recent `data` first, so rowNumber == 1 marks the
# row to keep for that id.
# NOTE(review): the ordering has no tie-breaker for rows sharing id and data;
# add a second sort column if the choice among such rows matters.
latest_first_per_id = Window.partitionBy("id").orderBy(col("data").desc())
df = df.withColumn("rowNumber", row_number().over(latest_first_per_id))

df.show()

+---+-----+--------------+-------------------+---------+
| id| nome|        cidade|               data|rowNumber|
+---+-----+--------------+-------------------+---------+
|  1|pedro|belo horizonte|2023-01-01 00:00:00|        1|
|  1|pedro|belo horizonte|2023-01-01 00:00:00|        2|
|  1|pedro|       uberaba|2022-01-01 00:00:00|        3|
|  1|pedro|    uberlandia|2021-01-01 00:00:00|        4|
|  2|maria|rio de janeiro|2023-01-01 00:00:00|        1|
|  2|maria|       niteroi|2021-01-01 00:00:00|        2|
|  2|maria|   nova iguacu|2021-01-01 00:00:00|        3|
|  3| josé|     são paulo|2023-01-01 00:00:00|        1|
|  3| josé|     são paulo|2022-01-01 00:00:00|        2|
|  3| josé|     são paulo|2021-01-01 00:00:00|        3|
+---+-----+--------------+-------------------+---------+



In [9]:
# Keep only the first-ranked row of each id. The column name uses the same
# casing it was created with ("rowNumber"), so the filter does not depend on
# Spark resolving column names case-insensitively.
df.filter("rowNumber = 1").show()

+---+-----+--------------+-------------------+---------+
| id| nome|        cidade|               data|rowNumber|
+---+-----+--------------+-------------------+---------+
|  1|pedro|belo horizonte|2023-01-01 00:00:00|        1|
|  2|maria|rio de janeiro|2023-01-01 00:00:00|        1|
|  3| josé|     são paulo|2023-01-01 00:00:00|        1|
+---+-----+--------------+-------------------+---------+



In [10]:
# Rows ranked after the first one within each id, i.e. the ones a
# "keep rowNumber 1" deduplication would discard. The column name uses the
# same casing it was created with ("rowNumber"), so the filter does not depend
# on Spark resolving column names case-insensitively.
df.filter("rowNumber > 1").show()

+---+-----+--------------+-------------------+---------+
| id| nome|        cidade|               data|rowNumber|
+---+-----+--------------+-------------------+---------+
|  1|pedro|belo horizonte|2023-01-01 00:00:00|        2|
|  1|pedro|       uberaba|2022-01-01 00:00:00|        3|
|  1|pedro|    uberlandia|2021-01-01 00:00:00|        4|
|  2|maria|       niteroi|2021-01-01 00:00:00|        2|
|  2|maria|   nova iguacu|2021-01-01 00:00:00|        3|
|  3| josé|     são paulo|2022-01-01 00:00:00|        2|
|  3| josé|     são paulo|2021-01-01 00:00:00|        3|
+---+-----+--------------+-------------------+---------+

