# Dynamic Partition Pruning
- Pruning partitions at runtime
- Problem Statement: Analyse users' listening activity on a song's release date, for songs released on or after `2020-01-01`

In [1]:
import warnings

# Suppress every warning category for the rest of the session so cell outputs stay readable.
warnings.simplefilter("ignore")

In [2]:
from pyspark.storagelevel import StorageLevel
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [3]:
# NOTE(review): `cml.data_v1` looks like the platform-provided data-connection helper — confirm.
import cml.data_v1 as cmldata

# Sample in-code customization of spark configurations
# (left disabled; presumably it has to run before the session below is created — confirm)
#from pyspark import SparkContext
#SparkContext.setSystemProperty('spark.executor.cores', '1')
#SparkContext.setSystemProperty('spark.executor.memory', '2g')

# Name of the data connection to look up; this value is environment-specific.
CONNECTION_NAME = "paul-aug26-aw-dl"
conn = cmldata.get_connection(CONNECTION_NAME)
# Obtain the Spark session from the connection; every later cell goes through `spark`.
spark = conn.get_spark_session()

sc = spark.sparkContext
# Restrict Spark logging to errors so log lines do not flood the cell outputs.
sc.setLogLevel("ERROR")

Setting spark.hadoop.yarn.resourcemanager.principal to pauldefusco


Spark Application Id:spark-af2de29d4f314ed69d837729cb980ba6


In [6]:
# Load the raw listening-activity CSV (header row present, column types inferred), keep the
# original value under `listen_time`, and derive a date-only `listen_date` column from it.
df_listening_actv = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/home/cdsw/data/partitioning/raw/Spotify_Listening_Activity.csv")
    .withColumnRenamed("listen_date", "listen_time")
    .withColumn("listen_date", F.to_date(F.col("listen_time"), "yyyy-MM-dd HH:mm:ss.SSSSSS"))
)

# Persist the activity as Parquet partitioned by `listen_date`, replacing any previous copy
# at the target location.
df_listening_actv.write.parquet(
    "s3a://paul-aug26-buk-a3c2b50a/data/pdefusco/data/listening_activity_pt",
    mode="overwrite",
    partitionBy="listen_date",
)

                                                                                

In [7]:
# Read back the Parquet dataset that was written partitioned by `listen_date`.
df_listening_actv_pt = spark.read.format("parquet").load(
    "s3a://paul-aug26-buk-a3c2b50a/data/pdefusco/data/listening_activity_pt"
)
# Preview five rows without truncating cell values.
df_listening_actv_pt.show(n=5, truncate=False)

                                                                                

+-----------+-------+--------------------------+---------------+-----------+
|activity_id|song_id|listen_time               |listen_duration|listen_date|
+-----------+-------+--------------------------+---------------+-----------+
|4456       |16     |2023-07-18 10:15:47.023264|151            |2023-07-18 |
|4457       |65     |2023-07-18 10:15:47.023264|181            |2023-07-18 |
|4458       |60     |2023-07-18 10:15:47.023264|280            |2023-07-18 |
|4459       |3      |2023-07-18 10:15:47.023264|249            |2023-07-18 |
|4460       |45     |2023-07-18 10:15:47.023264|130            |2023-07-18 |
+-----------+-------+--------------------------+---------------+-----------+
only showing top 5 rows



In [8]:
# Load the songs CSV; the header row supplies column names and column types are inferred.
df_songs = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/home/cdsw/data/partitioning/raw/Spotify_Songs.csv")
)
# Print the inferred schema to see how each column (notably `release_date`) was typed.
df_songs.printSchema()

[Stage 12:>                                                         (0 + 1) / 1]

root
 |-- song_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist_id: integer (nullable = true)
 |-- release_date: timestamp (nullable = true)



                                                                                

In [9]:
# Keep the original release timestamp as `release_datetime` and expose a date-only
# `release_date`, so songs can be matched against the activity's `listen_date`.
#
# Guard: this cell rebinds `df_songs` from itself. Without the check, a second run would
# rename the derived date column onto the already existing `release_datetime` and fail with
# an ambiguous-column error; with it, re-running the cell simply re-displays the frame.
if "release_datetime" not in df_songs.columns:
    df_songs = (
        df_songs
        .withColumnRenamed("release_date", "release_datetime")
        # The inferred schema shows this column is already a timestamp, so a parse pattern
        # would be ignored; a plain conversion to date is all that is needed.
        .withColumn("release_date", F.to_date("release_datetime"))
    )
df_songs.show(5, False)
df_songs.printSchema()

[Stage 13:>                                                         (0 + 1) / 1]

+-------+------+---------+--------------------------+------------+
|song_id|title |artist_id|release_datetime          |release_date|
+-------+------+---------+--------------------------+------------+
|1      |Song_1|2        |2021-10-15 10:15:47.006571|2021-10-15  |
|2      |Song_2|45       |2020-12-07 10:15:47.006588|2020-12-07  |
|3      |Song_3|25       |2022-07-11 10:15:47.006591|2022-07-11  |
|4      |Song_4|25       |2019-03-09 10:15:47.006593|2019-03-09  |
|5      |Song_5|26       |2019-09-07 10:15:47.006596|2019-09-07  |
+-------+------+---------+--------------------------+------------+
only showing top 5 rows

root
 |-- song_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist_id: integer (nullable = true)
 |-- release_datetime: timestamp (nullable = true)
 |-- release_date: date (nullable = true)



                                                                                

In [10]:
# Keep only songs released on or after 2020-01-01 (the cut-off in the problem statement).
# The literal is cast to a date explicitly so the comparison does not rely on implicit
# string-to-date coercion.
df_selected_songs = df_songs.filter(F.col("release_date") >= F.lit("2020-01-01").cast("date"))

# Inner-join the partitioned activity with the filtered songs on both the song and the day:
# a row survives only when a song was listened to on its own release date.
# The condition references `df_selected_songs` (the frame actually being joined). Its equality
# on the partition column `listen_date` is the predicate that dynamic partition pruning can
# use to skip partitions of the activity data at runtime.
df_listening_actv_of_selected_songs = df_listening_actv_pt.join(
    df_selected_songs,
    on=(
        (df_selected_songs.release_date == df_listening_actv_pt.listen_date)
        & (df_selected_songs.song_id == df_listening_actv_pt.song_id)
    ),
    how="inner",
)

# Uncomment to print the query plans and check whether a dynamic pruning filter on
# `listen_date` is applied to the Parquet scan.
# df_listening_actv_of_selected_songs.explain(True)

In [12]:
# Display the joined result with the default preview settings spelled out
# (first 20 rows, long values truncated).
df_listening_actv_of_selected_songs.show(n=20, truncate=True)



+-----------+-------+--------------------+---------------+-----------+-------+-------+---------+--------------------+------------+
|activity_id|song_id|         listen_time|listen_duration|listen_date|song_id|  title|artist_id|    release_datetime|release_date|
+-----------+-------+--------------------+---------------+-----------+-------+-------+---------+--------------------+------------+
|       9760|     89|2023-07-24 10:15:...|             81| 2023-07-24|     89|Song_89|       33|2023-07-24 10:15:...|  2023-07-24|
|       9768|     89|2023-07-24 10:15:...|            295| 2023-07-24|     89|Song_89|       33|2023-07-24 10:15:...|  2023-07-24|
|       9799|     89|2023-07-24 10:15:...|            272| 2023-07-24|     89|Song_89|       33|2023-07-24 10:15:...|  2023-07-24|
|       7322|     64|2023-10-25 10:15:...|             95| 2023-10-25|     64|Song_64|       32|2023-10-25 10:15:...|  2023-10-25|
+-----------+-------+--------------------+---------------+-----------+-------+-----

                                                                                