In [30]:
"""
3. Adobe Interesting SQL Interview Question | Solving Using 2 Approaches | Data Analytics
https://lnkd.in/g_-_9ymd

For every customer that bought Photoshop return a list of the customers and the total spent on all the products 
except for Photoshop products.

Example: Customer_id 123 has Photoshop and other products purchased, we need to exclude Photoshop in total revenue.
If a customer does not use Photoshop at all then exclude such customers.
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Reuse the active Spark session if there is one; otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Sample purchases as (customer_id, product, revenue) tuples.
data = [
    (123, "Premier Pro", 100),
    (123, "Photoshop", 50),
    (123, "After Effects", 50),
    (234, "Illustrator", 200),
    (234, "Pro", 100),
]

# Explicit schema: one StructField per column, in the same order as the
# tuple positions above.
schema = StructType(
    [
        StructField(column_name, column_type)
        for column_name, column_type in (
            ("customer_id", IntegerType()),
            ("product", StringType()),
            ("revenue", IntegerType()),
        )
    ]
)

df = spark.createDataFrame(data, schema)

# Print the rows and then the column types.
df.show()
df.printSchema()


+-----------+-------------+-------+
|customer_id|      product|revenue|
+-----------+-------------+-------+
|        123|  Premier Pro|    100|
|        123|    Photoshop|     50|
|        123|After Effects|     50|
|        234|  Illustrator|    200|
|        234|          Pro|    100|
+-----------+-------------+-------+

root
 |-- customer_id: integer (nullable = true)
 |-- product: string (nullable = true)
 |-- revenue: integer (nullable = true)



In [33]:
# Customers who bought Photoshop, reduced to one row per customer.
# The de-duplication matters: joining against the raw Photoshop rows would
# emit one copy of every purchase per Photoshop row, so a customer with
# several Photoshop rows would have their total inflated.
photoshop_customers = (
    df.filter(col("product") == "Photoshop")
    .select("customer_id")
    .distinct()
)

# A left_semi join keeps only the rows of `df` whose customer_id appears in
# `photoshop_customers`. It returns columns from the left side only, so no
# column renaming is needed, and it never multiplies left-side rows.
non_photoshop_revenue = (
    df.join(photoshop_customers, on="customer_id", how="left_semi")
    # Drop the Photoshop rows themselves before totalling. A customer whose
    # only purchases are Photoshop has no rows left and is absent from the result.
    .filter(col("product") != "Photoshop")
    .groupBy("customer_id")
    # `sum` is pyspark.sql.functions.sum (via the star import above),
    # not the Python builtin.
    .agg(sum("revenue").alias("revenue"))
)

non_photoshop_revenue.show()


[Stage 88:>                                                         (0 + 4) / 4]

+-----------+-------+
|customer_id|revenue|
+-----------+-------+
|        123|    150|
+-----------+-------+



                                                                                