In [10]:
import findspark
# Must run before the first `pyspark` import in the following cell: per the
# findspark documentation, init() locates the Spark installation and puts its
# Python bindings on sys.path so that `import pyspark` succeeds.
findspark.init()

In [11]:
import os

from pyspark.sql import SparkSession

# Master URL of the Spark standalone cluster. The default hostname looks like a
# Docker container ID, which changes whenever the container is recreated, so the
# URL can be overridden through the SPARK_MASTER_URL environment variable
# without editing the notebook. When the variable is unset the original
# hard-coded value is used.
spark_master_url = os.environ.get("SPARK_MASTER_URL", "spark://3efd074e93ff:7077")

# Build the session, or reuse one that already exists in this kernel.
spark = (
    SparkSession.builder
    .appName("SparkSQL-Transformations-Actions")
    .master(spark_master_url)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Handle to the underlying SparkContext; stopped in the last cell.
sc = spark.sparkContext

In [12]:
import importlib
import team_name.spark_utils

# Reload the helper module so that edits made to it on disk are picked up
# without restarting the kernel.
importlib.reload(team_name.spark_utils)

# (column name, type name) pairs handed to SparkUtils.generate_schema.
# Presumably listed in the same order as the columns of the CSV file, since an
# explicit schema is applied to the reader below -- TODO confirm against the file.
columns_info = [
    ("track_id", "string"),
    ("track_name", "string"),
    ("artist_id", "string"),
    ("artist_name", "string"),
    ("album_id", "string"),
    ("duration", "integer"),
    ("release_date", "timestamp"),
    ("popularity", "integer"),
    ("danceability", "double"),
    ("energy", "double"),
    ("key", "integer"),
    ("loudness", "double"),
    ("mode", "integer"),
    ("speechiness", "double"),
    ("acousticness", "double"),
    ("instrumentalness", "double"),
    ("liveness", "double"),
    ("valence", "double"),
    ("tempo", "double"),
    ("playlist_id", "string"),
    ("playlist_name", "string"),
    ("duration_mins", "double"),
    ("genre", "string"),
]

schema = team_name.spark_utils.SparkUtils.generate_schema(columns_info)

# Load the CSV into a DataFrame using the explicit schema; the first line of
# the file is treated as a header row.
tiktok_df = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .csv("/home/jovyan/notebooks/data/tiktok.csv")
)

In [13]:
# Filter and Count Popular Tracks. Filter songs with a popularity score greater than 80 and count the number of such tracks.
# NOTE: despite its name, `filtered_duration_df` holds the tracks whose
# popularity score is greater than 80 (it is not filtered on duration).
filtered_duration_df = tiktok_df.where(tiktok_df.popularity > 80)  # transformation
selected_df = filtered_duration_df.select(filtered_duration_df.popularity)  # transformation
row_count = selected_df.count()  # action: runs the job and returns the number of rows
row_count

                                                                                

1023

In [14]:
# Calculate Average Duration of Songs by Genre. Group songs by genre and calculate the average duration mins for each genre.
# One row per genre with the mean of duration_mins. GroupedData.mean is an
# alias of avg, so the result column is still named "avg(duration_mins)".
avg_duration_by_genre = (
    tiktok_df
    .groupBy("genre")
    .mean("duration_mins")
)
avg_duration_by_genre.show()  # action: prints the aggregated table

+------------------+------------------+
|             genre|avg(duration_mins)|
+------------------+------------------+
|TIKTOK PHILIPPINES|3.2801328435737513|
|      TIKTOK DANCE| 3.015020713916861|
|           _TIKTOK| 3.251196442168827|
|        TIKTOK OPM| 4.257192861885788|
+------------------+------------------+



In [None]:
# Top 5 most energetic songs: order all tracks by energy, highest first, and
# keep the first five rows. Rows tied on energy have no guaranteed relative order.

filtered_energetic_df = (
    tiktok_df
    .sort("energy", ascending=False)  # transformation
    .limit(5)                         # transformation
)
selected_energetic = filtered_energetic_df.select(["track_name", "energy"])  # transformation
selected_energetic.show()  # action

+--------------------+------------------+
|          track_name|            energy|
+--------------------+------------------+
|       Kiat Jud Dong|0.9990000000000001|
|       Bukan untukku|             0.998|
|    Ritmo Envolvente|             0.995|
|Tante Culik Aku Dong|             0.995|
|Biarlah Semua Ber...|             0.995|
+--------------------+------------------+



In [23]:
# Total duration of songs in each playlist: group by playlist_name and add up
# duration_mins. The dict form of agg yields the same "sum(duration_mins)" column.

total_duration_by_playlist = (
    tiktok_df
    .groupBy("playlist_name")
    .agg({"duration_mins": "sum"})
)
total_duration_by_playlist.show()  # action: prints the first 20 groups

+--------------------+------------------+
|       playlist_name|sum(duration_mins)|
+--------------------+------------------+
|5IZc3KIVFhjzJ0L2k...| 7.474666666666667|
|08ia51KbTcfs4QVT5...|            4.1485|
|7xVLFuuYdAvcTfcP3...| 9.456433333333333|
|2RBILNmyq8p4fqVWO...| 2.162933333333333|
|6GdDjthxbTGBV9rl2...|3.3209166666666667|
|7krYEnB1OI1RbnJBa...|2.0957666666666666|
|1FgPyHX7HruKDL4Tx...|            2.4448|
|62RtxFf9epYNWOUHJ...|2.6694333333333335|
|5ow0sNF1zSqp71Ix5...|2.7334833333333335|
|0LlJbV4lyzJYE14YC...|10.709133333333334|
|6NFKf8vBApSvtzkap...|3.7074333333333334|
|5P8lyudWE7HQxb4lu...| 4.250666666666667|
|2BgEsaKNfHUdlh97K...| 3.116433333333333|
|7F9vK8hNFMml4GtHs...| 3.173783333333333|
|4vVTI94F9uJ8lHNDW...|3.3657666666666666|
|2uULRpRtKhCdojXwo...|               2.2|
|1tRlGMHsf21FDo6pj...|           1.79005|
|215fAfwkWtlj30ofd...|2.3214166666666665|
|3bidbhpOYeV4knp8A...|            4.3057|
|0YFocHKmrMme7Isel...| 4.810383333333333|
+--------------------+------------------+

In [8]:
# Stop the SparkSession. This also stops the underlying SparkContext (`sc`),
# and unlike calling sc.stop() directly it shuts the session itself down, so a
# later SparkSession.builder.getOrCreate() in the same kernel starts cleanly.
spark.stop()