In [0]:
"""
Spotify SQL Interview Problem | Top 5 Artists | Aggregation and Window Functions in SQL
Write a sql query to determine top 5 artists who appear in the top 10 of global_song_rank table the highest number of times.
"""
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()

artists = spark.createDataFrame(
    [
        (101,'Ed Sheeran','Warner Music'),
        (120,'Drake	Warner','Music Group'),
        (125,'Bad Bunny','Rimas Entertainment'),
    ]
).toDF("artist_id", "artist_name", "label_owner")

songs = spark.createDataFrame(
    [
        (55511,101,'Perfect'),
        (45202,101,'Shape of You'),
        (22222,120,'One Dance'),
        (19960,120,'Hotline Bling')
    ]
).toDF("song_id","artist_id","name")

global_song_rank = spark.createDataFrame(
    [
        (1,45202,5),
        (3,45202,2),
        (1,19960,3),
        (9,19960,15)
    ]
).toDF("day","song_id","rank")

artists.show()
songs.show()
global_song_rank.show()


+---------+-------------+-------------------+
|artist_id|  artist_name|        label_owner|
+---------+-------------+-------------------+
|      101|   Ed Sheeran|       Warner Music|
|      120|Drake\tWarner|        Music Group|
|      125|    Bad Bunny|Rimas Entertainment|
+---------+-------------+-------------------+

+-------+---------+-------------+
|song_id|artist_id|         name|
+-------+---------+-------------+
|  55511|      101|      Perfect|
|  45202|      101| Shape of You|
|  22222|      120|    One Dance|
|  19960|      120|Hotline Bling|
+-------+---------+-------------+

+---+-------+----+
|day|song_id|rank|
+---+-------+----+
|  1|  45202|   5|
|  3|  45202|   2|
|  1|  19960|   3|
|  9|  19960|  15|
+---+-------+----+



In [0]:
artists.join(songs, on="artist_id", how="inner") \
    .join(global_song_rank, on="song_id", how="inner") \
    .filter(col("rank") <= 10) \
    .groupBy("artist_id").agg(count("artist_id").alias("num_of_appearances")) \
    .withColumn("dr", dense_rank().over(Window.orderBy(desc("num_of_appearances")))) \
    .filter(col("dr") <= 5) \
    .drop("dr", "num_of_appearances") \
    .join(artists, on="artist_id", how="inner") \
    .drop("label_owner") \
    .show()

+---------+-------------+
|artist_id|  artist_name|
+---------+-------------+
|      101|   Ed Sheeran|
|      120|Drake\tWarner|
+---------+-------------+

