In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Get (or reuse) the Spark session that every later cell uses to build DataFrames.
spark = (
    SparkSession.builder
    .appName("when otherwise question")
    .getOrCreate()
)

In [3]:
# Column names, in the same order as the fields of each record below.
columns = ["Account_No", "Customer_ID", "Amount"]

# Sample records: one tuple per (account, customer, amount) entry.
# Accounts A1 and A3 appear more than once; A2 and A4 appear exactly once.
data = [
    ("A1", "ID1", 1000), ("A1", "ID2", 1500),
    ("A2", "ID3", 2000),
    ("A3", "ID4", 500), ("A3", "ID5", 700), ("A3", "ID6", 800),
    ("A4", "ID7", 1200),
]

In [4]:
# Build the accounts DataFrame from the sample records, using `columns`
# as the column names, then print it.
df = spark.createDataFrame(data, schema=columns)
df.show()

+----------+-----------+------+
|Account_No|Customer_ID|Amount|
+----------+-----------+------+
|        A1|        ID1|  1000|
|        A1|        ID2|  1500|
|        A2|        ID3|  2000|
|        A3|        ID4|   500|
|        A3|        ID5|   700|
|        A3|        ID6|   800|
|        A4|        ID7|  1200|
+----------+-----------+------+



In [5]:
# Largest Amount per account.
# `max` here is the pyspark.sql.functions aggregate brought in by the
# star import at the top of the notebook, not Python's builtin max.
df2 = (
    df.groupBy("Account_No")
    .agg(max("Amount").alias("Max_Amount"))
)
df2.show()

+----------+----------+
|Account_No|Max_Amount|
+----------+----------+
|        A1|      1500|
|        A2|      2000|
|        A3|       800|
|        A4|      1200|
+----------+----------+



In [6]:
# Count the Customer_ID entries per account, then label the account
# "Joint" when that count is above 1 and "IND" otherwise.
# NOTE(review): the column name "Acocunt_count" is misspelled; it is used
# consistently in both the alias and the lookup below, so it works as-is.
df3 = (
    df.groupBy("Account_No")
    .agg(count("Customer_ID").alias("Acocunt_count"))
    .withColumn(
        "Account status",
        when(col("Acocunt_count") > 1, "Joint").otherwise("IND"),
    )
)
df3.show()

+----------+-------------+--------------+
|Account_No|Acocunt_count|Account status|
+----------+-------------+--------------+
|        A1|            2|         Joint|
|        A2|            1|           IND|
|        A3|            3|         Joint|
|        A4|            1|           IND|
+----------+-------------+--------------+



In [7]:
# Column names for the second sample set, in record-field order.
column = ['name', 'accountNumber', 'amount']

# Second sample set: (holder name, account number, amount).
# This rebinds `data`, replacing the records defined earlier in the notebook.
data = [
    ('Mahesh', 'A1', 100),
    ('Kishor', 'A1', 200),
    ('Saurabh', 'A2', 411),
    ('Chetan', 'A3', 744),
]

In [8]:
# Build the account-holders DataFrame from the second sample set,
# using `column` as the column names, then print it.
df_1 = spark.createDataFrame(data, schema=column)
df_1.show()

+-------+-------------+------+
|   name|accountNumber|amount|
+-------+-------------+------+
| Mahesh|           A1|   100|
| Kishor|           A1|   200|
|Saurabh|           A2|   411|
| Chetan|           A3|   744|
+-------+-------------+------+



In [9]:
# Count the distinct holder names per account number, then label the
# account "Joint" when more than one name shares it and "IND" otherwise.
# NOTE(review): the column name "Acocunt_count" is misspelled; it is used
# consistently in both the alias and the lookup below, so it works as-is.
df4 = (
    df_1.groupBy("accountNumber")
    .agg(countDistinct("name").alias("Acocunt_count"))
    .withColumn(
        "Account status",
        when(col("Acocunt_count") > 1, "Joint").otherwise("IND"),
    )
)
df4.show()

+-------------+-------------+--------------+
|accountNumber|Acocunt_count|Account status|
+-------------+-------------+--------------+
|           A2|            1|           IND|
|           A3|            1|           IND|
|           A1|            2|         Joint|
+-------------+-------------+--------------+

