In [1]:
import math
import pandas as pd

# Path of the source CSV inside the Kaggle input directory.
SONGS = '/kaggle/input/most-streamed-spotify-songs-2024/Most Streamed Spotify Songs 2024.csv'

# Read the CSV (parsing 'Release Date' as a date and treating ',' as the
# thousands separator), drop the ISRC and TIDAL Popularity columns, discard
# rows that have no Artist, and replace every remaining missing value with 0.
df = (
    pd.read_csv(
        filepath_or_buffer=SONGS,
        encoding='iso8859_10',
        parse_dates=['Release Date'],
        thousands=',',
    )
    .drop(columns=['ISRC', 'TIDAL Popularity'])
    .dropna(subset=['Artist'])
    .fillna(value=0)
)

# Natural logarithm of the track score, stored as an extra column.
df['log track score'] = df['Track Score'].map(math.log)

df.head()

Unnamed: 0,Track,Album Name,Artist,Release Date,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,Spotify Popularity,...,SiriusXM Spins,Deezer Playlist Count,Deezer Playlist Reach,Amazon Playlist Count,Pandora Streams,Pandora Track Stations,Soundcloud Streams,Shazam Counts,Explicit Track,log track score
0,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,2024-04-26,1,725.4,390470900.0,30716.0,196631588.0,92.0,...,684.0,62.0,17598718.0,114.0,18004655.0,22931.0,4818457.0,2669262.0,0,6.586723
1,Not Like Us,Not Like Us,Kendrick Lamar,2024-05-04,2,545.9,323703900.0,28113.0,174597137.0,92.0,...,3.0,67.0,10422430.0,111.0,7780028.0,28444.0,6623075.0,1118279.0,1,6.302436
2,i like the way you kiss me,I like the way you kiss me,Artemas,2024-03-19,3,538.4,601309300.0,54331.0,211607669.0,92.0,...,536.0,136.0,36321847.0,172.0,5022621.0,5639.0,7208651.0,5285340.0,0,6.288602
3,Flowers,Flowers - Single,Miley Cyrus,2023-01-12,4,444.9,2031281000.0,269802.0,136569078.0,85.0,...,2182.0,264.0,24684248.0,210.0,190260277.0,203384.0,0.0,11822942.0,0,6.09785
4,Houdini,Houdini,Eminem,2024-05-31,5,423.3,107034900.0,7223.0,151469874.0,88.0,...,1.0,82.0,17660624.0,105.0,4493884.0,7006.0,207179.0,457017.0,1,6.048081


In [2]:
# Summarise the cleaned frame: one line per column with its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4595 entries, 0 to 4599
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Track                       4595 non-null   object        
 1   Album Name                  4595 non-null   object        
 2   Artist                      4595 non-null   object        
 3   Release Date                4595 non-null   datetime64[ns]
 4   All Time Rank               4595 non-null   int64         
 5   Track Score                 4595 non-null   float64       
 6   Spotify Streams             4595 non-null   float64       
 7   Spotify Playlist Count      4595 non-null   float64       
 8   Spotify Playlist Reach      4595 non-null   float64       
 9   Spotify Popularity          4595 non-null   float64       
 10  YouTube Views               4595 non-null   float64       
 11  YouTube Likes               4595 non-null   float64       
 1

Let's do the easy thing first and just look at top artists in this sample.

In [3]:
from plotly import express

# How many artists to show in the "top artists" charts.
TOP_N = 40

# Count tracks per artist. The column names are set explicitly because the
# labels produced by value_counts().to_frame().reset_index() depend on the
# pandas version ('index'/'Artist' before 2.0, 'Artist'/'count' from 2.0 on).
artist_counts = (
    df['Artist']
    .value_counts()
    .rename_axis('Artist')
    .reset_index(name='count')
    .head(n=TOP_N)
)

# The frame already holds one row per artist, so plot the counts directly as
# bars instead of re-aggregating them through a histogram.
express.bar(
    data_frame=artist_counts,
    x='Artist',
    y='count',
    title=f'Top {TOP_N} artists by number of tracks in the sample',
)

Also, what period of time does our sample cover?

In [4]:
from plotly import express

# Distribution of release dates across the whole sample.
express.histogram(
    df,
    x='Release Date',
)

From this plot we learn that our dataset records when each track was released, but not when it was played. How do we interpret this graph? It looks like the data is dominated by tracks released this year (2024).

Can we pick out artists by their mean track score?

In [5]:
from plotly import express

# Mean track score per artist, highest first, limited to the TOP_N artists.
mean_scores = (
    df.groupby(by='Artist')['Track Score']
    .mean()
    .sort_values(ascending=False)
    .head(n=TOP_N)
    .reset_index()
)

# One row per artist, so draw bars directly. A histogram would label the axis
# "sum of Track Score" even though the plotted value is a mean.
express.bar(
    data_frame=mean_scores,
    x='Artist',
    y='Track Score',
    labels={'Track Score': 'Mean Track Score'},
    title=f'Top {TOP_N} artists by mean track score',
)

It's fascinating that an artist's mean track score does not seem to bear any relation to how many tracks that artist has in the sample.

In [6]:
from plotly import express

# Distribution of the raw track score.
express.histogram(
    df,
    x='Track Score',
)

Track scores look kind of Gaussian but with a really long right tail.

In [7]:
from plotly import express
# Distribution of the log-transformed track score.
express.histogram(
    df,
    x='log track score',
)

The log of the track score looks more Gaussian, but not especially so.

What proportion of the tracks in our sample are tagged as explicit?

In [8]:
# Fraction of tracks for each value of the 'Explicit Track' flag, as a plain dict.
(
    df['Explicit Track']
    .value_counts(normalize=True)
    .to_dict()
)

{0: 0.64069640914037, 1: 0.35930359085963004}

About 36%, or a little more than a third.

Let's look at the relationship between the track score and the all time rank.

In [9]:
from plotly import express

# Track score plotted against all-time rank, one point per track.
express.scatter(
    df,
    x='All Time Rank',
    y='Track Score',
)

Ah. The all time rank is based on the track score.

Let's use dimension reduction to look at all of the features that are floating-point values (except for the Track Score).

In [10]:
import arrow
from umap import UMAP

# Columns that must never be fed to UMAP:
#   - 'Track Score' is the quantity we want to explain;
#   - 'log track score' is derived from it (and is float64 too), so leaving it
#     in would leak the target into the embedding that is later coloured by it;
#   - 'x' / 'y' are the embedding itself, excluded so that re-running this cell
#     cannot feed a previous embedding back in as a feature.
EXCLUDED_COLUMNS = {'Track Score', 'log track score', 'x', 'y'}

# Every remaining float64 column is used as an input feature.
COLUMNS = [
    column
    for column in df.select_dtypes(include='float64').columns
    if column not in EXCLUDED_COLUMNS
]

# NOTE(review): the features are passed to UMAP unscaled; if their magnitudes
# differ a lot, the largest columns will dominate the distances. Consider
# standardising them first.
time_start = arrow.now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=201)
# Store the two embedding coordinates alongside the original columns.
df[['x', 'y']] = umap.fit_transform(X=df[COLUMNS])
print('done with UMAP in {}'.format(arrow.now() - time_start))

2024-07-05 17:56:30.220001: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-05 17:56:30.220130: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-05 17:56:30.392702: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=201, n_jobs=1, random_state=2024, verbose=True)
Fri Jul  5 17:56:42 2024 Construct fuzzy simplicial set
Fri Jul  5 17:56:42 2024 Finding Nearest Neighbors
Fri Jul  5 17:56:42 2024 Building RP forest with 8 trees
Fri Jul  5 17:56:49 2024 NN descent for 12 iterations
	 1  /  12
	 2  /  12
	 3  /  12
	Stopping threshold met -- exiting after 3 iterations
Fri Jul  5 17:57:09 2024 Finished Nearest Neighbor Search
Fri Jul  5 17:57:12 2024 Construct embedding


Epochs completed:   0%|            0/201 [00:00]

	completed  0  /  201 epochs
	completed  20  /  201 epochs
	completed  40  /  201 epochs
	completed  60  /  201 epochs
	completed  80  /  201 epochs
	completed  100  /  201 epochs
	completed  120  /  201 epochs
	completed  140  /  201 epochs
	completed  160  /  201 epochs
	completed  180  /  201 epochs
	completed  200  /  201 epochs
Fri Jul  5 17:57:17 2024 Finished embedding
done with UMAP in 0:00:34.934326


In [11]:
from plotly import express

# Draw the same UMAP embedding twice, coloured first by the log track score
# and then by the all-time rank; hovering shows the track and its artist.
for colour_column in ('log track score', 'All Time Rank'):
    embedding_figure = express.scatter(
        data_frame=df,
        x='x',
        y='y',
        color=colour_column,
        height=800,
        hover_name='Track',
        hover_data=['Artist'],
    )
    embedding_figure.show()

It is not immediately obvious which values in our dataset are predictive of Track Score. All Time Rank is, as we saw above, just a ranking of Track Score.