In [0]:
"""
Among email ids that are duplicates when letter case is ignored, keep only the all-lowercase rows; emails that have no duplicate are kept as they are.
Expected Output: employee_id      107 104 102 105 101 103
"""

from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Sample employee records as (employee_id, employee_name, email_id) tuples.
# Ids 102, 105 and 106 carry the same address when letter case is ignored.
employee_rows = [
    (101, 'Liam Alton', 'li.al@abc.com'),
    (102, 'Josh Day', 'jo.da@abc.com'),
    (103, 'Sean Mann', 'se.ma@abc.com'),
    (104, 'Evan Blake', 'ev.bl@abc.com'),
    (105, 'Toby Scott', 'jo.da@abc.com'),
    (106, 'Anjali Chouhan', 'JO.DA@ABC.COM'),
    (107, 'Ankit Bansal', 'AN.BA@ABC.COM'),
]
employee_columns = ["employee_id", "employee_name", "email_id"]

# Column types are inferred by Spark from the Python values.
employees = spark.createDataFrame(employee_rows, employee_columns)

# Display the full (untruncated) rows and the inferred schema.
employees.show(truncate=False)
employees.printSchema()

+-----------+--------------+-------------+
|employee_id|employee_name |email_id     |
+-----------+--------------+-------------+
|101        |Liam Alton    |li.al@abc.com|
|102        |Josh Day      |jo.da@abc.com|
|103        |Sean Mann     |se.ma@abc.com|
|104        |Evan Blake    |ev.bl@abc.com|
|105        |Toby Scott    |jo.da@abc.com|
|106        |Anjali Chouhan|JO.DA@ABC.COM|
|107        |Ankit Bansal  |AN.BA@ABC.COM|
+-----------+--------------+-------------+

root
 |-- employee_id: long (nullable = true)
 |-- employee_name: string (nullable = true)
 |-- email_id: string (nullable = true)



In [0]:
# Keep one "case variant" per email address: when the same address appears in
# several casings, retain only the all-lowercase rows; an address that has no
# all-lowercase variant is kept unchanged.
#
# The preference is decided by comparing the WHOLE address with its lowercase
# form. ascii() returns the code of the first character only, so it cannot
# separate variants whose first character is a digit/symbol or whose case
# difference occurs later in the string.
lowercase_first = Window.partitionBy("lowercase_email_id").orderBy(desc("is_lowercase"))

(
    employees
    # Grouping key: the address with case folded away.
    .withColumn("lowercase_email_id", lower(col("email_id")))
    # True when the stored address is already entirely lowercase.
    .withColumn("is_lowercase", col("email_id") == col("lowercase_email_id"))
    # rank() keeps ties: every lowercase row in a group gets 1, and if the
    # group has no lowercase row at all, every row in it gets 1.
    .withColumn("rn", rank().over(lowercase_first))
    .filter(col("rn") == 1)
    .select("employee_id", "employee_name", "email_id")
    .show(truncate=False)
)

+-----------+-------------+-------------+
|employee_id|employee_name|email_id     |
+-----------+-------------+-------------+
|107        |Ankit Bansal |AN.BA@ABC.COM|
|104        |Evan Blake   |ev.bl@abc.com|
|102        |Josh Day     |jo.da@abc.com|
|105        |Toby Scott   |jo.da@abc.com|
|101        |Liam Alton   |li.al@abc.com|
|103        |Sean Mann    |se.ma@abc.com|
+-----------+-------------+-------------+

