In [54]:
# Installing required packages
!pip install pyspark
!pip install findspark
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [55]:
import findspark
findspark.init()

In [56]:
import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [57]:
# Creating a spark context class
# sc = SparkContext()

# Creating a spark session
spark = SparkSession \
    .builder \
    .appName("Python Spark DataFrames basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [58]:
spark

In [59]:
music = pd.read_csv('music_transformed.csv')
sdf = spark.createDataFrame(music) 

In [60]:
sdf.printSchema()

root
 |-- Unnamed: 0: long (nullable = true)
 |-- Artist_Name: string (nullable = true)
 |-- Track_Name: string (nullable = true)
 |-- Popularity: double (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: long (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: double (nullable = true)
 |-- Genre: double (nullable = true)
 |-- duration_in_min: double (nullable = true)



## b) Remove any songs that exceed 5 minutes

In [61]:
sdf1 = sdf.filter((sdf.duration_in_min < 5))
sdf1.show(10)

+----------+--------------------+--------------------+----------+------------+------+--------+----+-----------+------------+--------+-------+-------+--------------+-----+------------------+
|Unnamed: 0|         Artist_Name|          Track_Name|Popularity|danceability|energy|loudness|mode|speechiness|acousticness|liveness|valence|  tempo|time_signature|Genre|   duration_in_min|
+----------+--------------------+--------------------+----------+------------+------+--------+----+-----------+------------+--------+-------+-------+--------------+-----+------------------+
|         0|          Bruno Mars|That's What I Lik...|      60.0|       0.854| 0.564|  -4.964|   1|     0.0485|      0.0171|  0.0849|  0.899|134.071|           4.0|  5.0|3.9099333333333335|
|         1|              Boston|        Hitch a Ride|      54.0|       0.382| 0.814|   -7.23|   1|     0.0406|      0.0011|   0.101|  0.569|116.454|           4.0| 10.0|           4.19555|
|         2|       The Raincoats|  No Side to Fall

In [62]:
spark.catalog.dropTempView("music")
sdf1.createOrReplaceTempView("music")

## c) Display songs by J. Cole, Novo Amor and Anson Seabra

In [71]:
sdf2 = sdf1.filter((sdf.Artist_Name == 'J. Cole') | (sdf.Artist_Name == 'Novo Amor')| (sdf.Artist_Name == 'Anson Seabra'))
sdf2.show(10)

+----------+-----------+--------------------+----------+------------+------+--------+----+-----------+------------+--------+-------+-------+--------------+-----+------------------+
|Unnamed: 0|Artist_Name|          Track_Name|Popularity|danceability|energy|loudness|mode|speechiness|acousticness|liveness|valence|  tempo|time_signature|Genre|   duration_in_min|
+----------+-----------+--------------------+----------+------------+------+--------+----+-----------+------------+--------+-------+-------+--------------+-----+------------------+
|      1525|    J. Cole|p u n c h i n ‚Äò...|      81.0|       0.769|  0.74|  -6.579|   0|      0.378|       0.197|   0.171|  0.692| 94.104|           4.0|  5.0|1.8763166666666664|
|      2497|    J. Cole|p r i d e . i s ....|      85.0|       0.861| 0.656|  -7.867|   0|      0.425|       0.104|  0.0986|  0.331| 88.967|           4.0|  5.0| 3.636833333333333|
|      4051|    J. Cole|   i n t e r l u d e|      81.0|        0.77| 0.574|  -5.525|   1|     

## d) How many songs are included in every category?

In [64]:
category_counts = sdf1.groupby(['Genre'])\
.agg({"Genre": "count"})\
.show()

+-----+------------+
|Genre|count(Genre)|
+-----+------------+
|  8.0|         772|
|  0.0|         301|
|  7.0|         201|
|  1.0|         698|
|  4.0|         207|
|  3.0|         140|
|  2.0|         579|
| 10.0|        2235|
|  6.0|        1225|
|  5.0|         681|
|  9.0|        1079|
+-----+------------+



## e) Which artists dominated the charts?

In [65]:
category_counts = sdf1.groupby(['Artist_Name'])\
.agg({"Artist_Name": "count",'Popularity':"avg"})\
.sort("avg(Popularity)","count(Artist_Name)",ascending=False)\
.show()

+--------------------+------------------+---------------+
|         Artist_Name|count(Artist_Name)|avg(Popularity)|
+--------------------+------------------+---------------+
|       Doja Cat, SZA|                 1|           95.0|
|     Los Legendarios|                 1|           95.0|
|Justin Bieber, Da...|                 1|           95.0|
|The Weeknd, Arian...|                 1|           94.0|
|               Riton|                 1|           94.0|
|          The Weeknd|                 2|           93.5|
|Riton, Nightcrawl...|                 1|           92.0|
|          Tion Wayne|                 2|           90.0|
|             Cardi B|                 1|           90.0|
|                 SZA|                 2|           89.0|
|Galantis, David G...|                 1|           89.0|
|  Majestic, Boney M.|                 1|           89.0|
|       The Kid LAROI|                 2|           88.5|
|    Trinidad Cardona|                 1|           88.0|
|     ATB, Top

In [66]:
sdf1.groupby(['Artist_Name'])\
.agg({"Artist_Name": "count",'Popularity':"avg"})\
.sort("count(Artist_Name)","avg(Popularity)",ascending=False)\
.show()

+--------------------+------------------+------------------+
|         Artist_Name|count(Artist_Name)|   avg(Popularity)|
+--------------------+------------------+------------------+
|     Backstreet Boys|                35| 36.65714285714286|
|      Britney Spears|                27| 50.56902356902357|
|            Westlife|                26|7.6923076923076925|
|  The Rolling Stones|                22| 48.38257575757575|
|         The Beatles|                15| 68.11111111111111|
|             Nirvana|                14| 64.21428571428571|
|       Fleetwood Mac|                14|60.714285714285715|
|                  U2|                12|             58.75|
|     Lata Mangeshkar|                12|13.904761904761905|
|               AC/DC|                11| 67.81818181818181|
|      The Black Keys|                11|              57.0|
|      Arctic Monkeys|                10|              70.0|
|   Cage The Elephant|                10|              66.3|
|         The Killers|  

## f) What songs would be considered for the “Billboard Top 10 Songs of the Year”? (mention their artists as well)

In [69]:
sdf1.select("Track_Name","Artist_Name","Popularity").distinct().sort("Popularity","Track_Name",ascending=False)\
.show(10)

+--------------------+--------------------+----------+
|          Track_Name|         Artist_Name|Popularity|
+--------------------+--------------------+----------+
|            good 4 u|      Olivia Rodrigo|      99.0|
|Kiss Me More (fea...|            Doja Cat|      98.0|
|Astronaut In The ...|         Masked Wolf|      97.0|
|              Butter|                 BTS|      96.0|
|             deja vu|      Olivia Rodrigo|      95.0|
| Permission to Dance|                 BTS|      95.0|
|Peaches (feat. Da...|Justin Bieber, Da...|      95.0|
|Kiss Me More (fea...|       Doja Cat, SZA|      95.0|
|                Fiel|     Los Legendarios|      95.0|
|             traitor|      Olivia Rodrigo|      94.0|
+--------------------+--------------------+----------+
only showing top 10 rows



## g) Recommend at least 5 songs that can be played at a party

In [70]:
sdf1.select("Track_Name","Artist_Name","danceability","Popularity","Popularity","loudness")\
.distinct().sort("danceability","Popularity","energy","loudness",ascending=False)\
.show(10)

+--------------------+--------------------+------------+----------+----------+--------+
|          Track_Name|         Artist_Name|danceability|Popularity|Popularity|loudness|
+--------------------+--------------------+------------+----------+----------+--------+
|  Dancing in My Room|            347aidan|        0.98|      82.0|      82.0| -11.052|
|  Dancing in My Room|            347aidan|       0.979|       3.0|       3.0| -11.056|
|    OG Bobby Johnson|                QUE.|       0.974|      41.0|      41.0|  -3.879|
|        The Get Back|        Flipp Dinero|       0.968|      57.0|      57.0|  -6.674|
|SexyBack (feat. T...|   Justin Timberlake|       0.967|      80.0|      80.0|  -5.562|
|SexyBack (feat. T...|Justin Timberlake...|       0.967|      78.0|      78.0|  -5.562|
|Suspense Is Killi...|           Boy 8-Bit|       0.962|      19.0|      19.0| -11.178|
|          Step It Up|               Latto|       0.961|      59.0|      59.0|   -5.76|
|            Titanium|          