In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session used by every later cell (getOrCreate hands back
# an already-running session when there is one).
spark = (
    SparkSession.builder
    .appName("Removing Duplicates")
    .getOrCreate()
)

In [3]:
# Column names, listed in the same order as the fields of each record below.
header = ["id", "name", "age", "salary"]

# Sample records chosen to exercise several kinds of duplication.
# Note that salary is stored as text ("50M"), not as a number.
people_list = [
    (1, "Hoa", 34, "50M"),  # first occurrence
    (1, "Hoa", 34, "50M"),  # identical to the previous row in every field
    (1, "Hoa", 34, "60M"),  # same id/name/age, different salary
    (1, "Nam", 26, "30M"),  # same id as the rows above, different person
    (2, "Nam", 26, "40M"),  # same name/age as the previous row, different id
]

In [4]:
# Build the DataFrame from the sample tuples, taking the column names from
# `header`; column types are left for Spark to infer from the values.
df = spark.createDataFrame(people_list, schema=header)
df.show()

+---+----+---+------+
| id|name|age|salary|
+---+----+---+------+
|  1| Hoa| 34|   50M|
|  1| Hoa| 34|   50M|
|  1| Hoa| 34|   60M|
|  1| Nam| 26|   30M|
|  2| Nam| 26|   40M|
+---+----+---+------+



In [5]:
# Approach 1: distinct() compares whole rows, so only the row that repeats in
# every column (1, Hoa, 34, 50M) is collapsed; the "60M" variant is kept.
# The printed row order differs from the input order.
df_distinct = df.distinct()
df_distinct.show()

+---+----+---+------+
| id|name|age|salary|
+---+----+---+------+
|  1| Hoa| 34|   50M|
|  1| Hoa| 34|   60M|
|  2| Nam| 26|   40M|
|  1| Nam| 26|   30M|
+---+----+---+------+



In [6]:
# Approach 2a: de-duplicate on the "id" column only, leaving one row per id.
# NOTE(review): no ordering is specified here, so this code does not choose
# which of the rows sharing an id survives — confirm before relying on it.
df_dropDup_1 = df.dropDuplicates(["id"])
df_dropDup_1.show()

+---+----+---+------+
| id|name|age|salary|
+---+----+---+------+
|  1| Hoa| 34|   50M|
|  2| Nam| 26|   40M|
+---+----+---+------+



In [7]:
# Approach 2b: de-duplicate on the (id, name) pair, leaving one row per
# distinct combination; age and salary are not part of the comparison.
# NOTE(review): as with the single-column case, no ordering is specified, so
# the surviving row within each (id, name) group is not chosen by this code.
df_dropDup_2 = df.dropDuplicates(["id", "name"])
df_dropDup_2.show()

+---+----+---+------+
| id|name|age|salary|
+---+----+---+------+
|  1| Hoa| 34|   50M|
|  1| Nam| 26|   30M|
|  2| Nam| 26|   40M|
+---+----+---+------+



In [None]:
from pyspark.sql.functions import (
    asc,
    col,
    dense_rank,
    desc,
    percent_rank,
    rank,
    regexp_extract,
    row_number,
)
from pyspark.sql.window import Window, WindowSpec

In [9]:
# Approach 3: rank the rows inside each id so that exactly one can be kept.
# Rows are ordered oldest first, then lowest salary first.
#
# salary is a text column ("50M", "60M", ...). Sorting that text directly is
# lexicographic, so a value like "100M" would rank ahead of "50M". Order by
# the leading digits cast to an integer instead.
# NOTE(review): assumes every salary starts with digits and that all values
# share the same unit suffix — confirm against the real data.
window = Window.partitionBy("id").orderBy(
    desc("age"),
    regexp_extract(col("salary"), r"^(\d+)", 1).cast("int").asc(),
)

In [10]:
# Keep every existing column and append "row_number": the 1-based position of
# each row within its window partition, following the window's ordering.
df_window = df.select("*", row_number().over(window).alias("row_number"))
df_window.show()

+---+----+---+------+----------+
| id|name|age|salary|row_number|
+---+----+---+------+----------+
|  1| Hoa| 34|   50M|         1|
|  1| Hoa| 34|   50M|         2|
|  1| Hoa| 34|   60M|         3|
|  1| Nam| 26|   30M|         4|
|  2| Nam| 26|   40M|         1|
+---+----+---+------+----------+



In [11]:
# Keep only the top-ranked row of each partition, then discard the helper
# ranking column so the result has the original four columns again.
df_rowNum1 = (
    df_window
    .filter("row_number = 1")
    .drop("row_number")
)
df_rowNum1.show()

+---+----+---+------+
| id|name|age|salary|
+---+----+---+------+
|  1| Hoa| 34|   50M|
|  2| Nam| 26|   40M|
+---+----+---+------+

