In [1]:
# enable pyspark
# findspark.init() runs here, before the `pyspark` imports in the next cell.
# NOTE(review): presumably this locates the local Spark install and puts
# pyspark on sys.path -- confirm against the findspark docs.
import findspark
findspark.init()

In [2]:
# Instantiate a local SparkSession that runs with 8 worker threads.
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

appName = "Remove duplicate rows."
master = "local[8]"

# Create Spark session.
# The application name is applied to the configuration here; previously
# `appName` was defined but never used, so the session ran under a default name.
conf = SparkConf().setMaster(master).setAppName(appName)
spark = (
    SparkSession.builder
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

# Log verbosity for this session; accepted levels include INFO/WARN/DEBUG.
# https://kontext.tech/column/spark/457/tutorial-turn-off-info-logs-in-spark
spark.sparkContext.setLogLevel("INFO")

In [3]:

# Build the demo DataFrame. Relies on the `spark` session created in the
# previous cell.

# Ten seed rows. ("James", "Sales", 3000) appears twice, and Robert and Saif
# share the same (department, salary) pair, so the de-duplication cells
# below have rows to remove.
BASE_ROWS = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]

# Repeat the seed block so the DataFrame holds many duplicates:
# 1 original copy + 100 extra copies = 1010 rows, in the same order as
# appending the ten rows one by one inside a loop.
N_EXTRA_COPIES = 100
data = BASE_ROWS * (1 + N_EXTRA_COPIES)

columns = ["employee_name", "department", "salary"]
df = spark.createDataFrame(data=data, schema=columns)
df.printSchema()
df.show(truncate=False)
# Last expression of the cell: the total row count is displayed as the cell output.
df.count()


root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+
only showing top 20 rows



1010

### Distinct Rows (By Comparing All Columns)

In [4]:
# De-duplicate by comparing all columns of `df`.
distinctDF = df.distinct()

# Count first, then report the count and display the surviving rows.
distinct_total = distinctDF.count()
print(f'Distinct count: {distinct_total}')
distinctDF.show(truncate=False)


Distinct count: 9
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Jen          |Finance   |3900  |
|Michael      |Sales     |4600  |
|Scott        |Finance   |3300  |
|Kumar        |Marketing |2000  |
|James        |Sales     |3000  |
|Robert       |Sales     |4100  |
|Jeff         |Marketing |3000  |
|Saif         |Sales     |4100  |
|Maria        |Finance   |3000  |
+-------------+----------+------+



### Alternatively, you can call the `dropDuplicates()` function, which returns a new DataFrame with duplicate rows removed.

In [5]:
# Same de-duplication as above, this time via dropDuplicates() with no
# column subset.
df2 = df.dropDuplicates()

deduped_total = df2.count()
print(f"Distinct count: {deduped_total}")
df2.show(truncate=False)


Distinct count: 9
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Jen          |Finance   |3900  |
|Michael      |Sales     |4600  |
|Scott        |Finance   |3300  |
|Kumar        |Marketing |2000  |
|James        |Sales     |3000  |
|Robert       |Sales     |4100  |
|Jeff         |Marketing |3000  |
|Saif         |Sales     |4100  |
|Maria        |Finance   |3000  |
+-------------+----------+------+



### Distinct based on Selected Columns
The `dropDuplicates()` function also accepts a list of column names. Rows are then treated as duplicates when they match on just those columns, and only one row is kept for each distinct combination.

In [6]:
# De-duplicate on a subset of columns: one row is kept per distinct
# (department, salary) pair.
# NOTE(review): which employee_name survives for a shared pair (e.g. Robert
# vs. Saif) is presumably not guaranteed -- confirm before relying on it.
dedup_keys = ["department", "salary"]
dropDisDF = df.dropDuplicates(dedup_keys)

key_distinct_total = dropDisDF.count()
print(f"Distinct count based on columns department and salary: {key_distinct_total}")
dropDisDF.show(truncate=False)

Distinct count based on columns department and salary: 8
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Jen          |Finance   |3900  |
|Maria        |Finance   |3000  |
|Scott        |Finance   |3300  |
|Kumar        |Marketing |2000  |
|James        |Sales     |3000  |
|Jeff         |Marketing |3000  |
+-------------+----------+------+

