<a href="https://www.kaggle.com/code/mervetas/spotify-topsongs-trends?scriptVersionId=143856953" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Top Spotify Songs Exploratory Analysis

*by Merve H. Tas Bangert*


In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [None]:
# Read the Spotify 2023 songs CSV from the Kaggle input directory.
# The encoding is passed explicitly as latin-1.
csv_path = '/kaggle/input/top-spotify-songs-2023/spotify-2023.csv'
df = pd.read_csv(csv_path, encoding='latin-1')

# Bare last expression so the notebook renders a preview of the first rows.
df.head()

## Most Streamed Artists and Most Frequent Artists, Release Years and BPM Values

In [None]:
# Make 'streams' numeric: unparsable entries become NaN (errors='coerce'),
# and any +/-inf values are replaced by NaN as well.
df['streams'] = (
    pd.to_numeric(df['streams'], errors='coerce')
    .replace([np.inf, -np.inf], np.nan)
)

# Total streams per distinct 'artist(s)_name' value, largest first.
# NOTE(review): each distinct name string is its own group; presumably a
# collaboration is counted separately from its members' solo tracks — verify.
top_n = 10
artist_streams = (
    df.groupby('artist(s)_name')['streams']
    .sum()
    .sort_values(ascending=False)
)
top_artists = artist_streams.head(top_n)

# Bar chart of the top_n artists by summed stream count.
fig, ax = plt.subplots(figsize=(12, 6))
top_artists.plot(kind='bar', color='skyblue', ax=ax)
ax.set_xlabel('Artist')
ax.set_ylabel('Total Stream Count')
ax.set_title(f'Top {top_n} Most Streamed Artists')
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()

plt.show()

In [None]:
# Columns for which the ten most frequent values are plotted below.
plt_cols =  ['artist(s)_name', 'released_year', 'bpm']

def col_fname(column):
    """Turn a snake_case column name into a display label.

    The name is split on underscores, each piece is passed through
    ``str.capitalize`` (first character upper-cased, the rest lower-cased),
    and the pieces are joined with single spaces.

    Args:
        column: Column name string, e.g. ``'released_year'``.

    Returns:
        The formatted label, e.g. ``'Released Year'``.
    """
    return ' '.join(part.capitalize() for part in column.split('_'))

# For every column in plt_cols, draw a count plot restricted to that column's
# most frequent values.
n_top = 10  # number of most frequent values shown per column
pastel_palette = sns.color_palette("pastel")  # same for every plot, so built once

for col in plt_cols:
    # value_counts() orders by frequency, so head() yields the most common values.
    # A dedicated name is used so the integer `top_n` defined in the earlier
    # cell is no longer overwritten with a Series.
    top_values = df[col].value_counts().head(n_top)
    col_name = col_fname(col)

    fig, ax = plt.subplots(figsize=(12, 6))
    # `order` both restricts the bars to the top values and sorts them by frequency.
    sns.countplot(data=df, x=col, order=top_values.index, palette=pastel_palette, ax=ax)
    ax.set_xlabel(col_name)
    ax.set_ylabel('No. of Tracks')
    ax.set_title(f'Top {n_top} Most Frequent {col_name}')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    fig.tight_layout()

    plt.show()

## Normalise streams to a "popularity" score and check for correlations with other features

In [None]:
# Min-max scale the stream counts into a 0-1 "popularity" score.
# NOTE(review): relies on df['streams'] already being numeric (it is coerced
# in an earlier cell) — confirm the cells are run in order.
min_value, max_value = df['streams'].agg(['min', 'max'])

df["popularity"] = df['streams'].sub(min_value).div(max_value - min_value)

In [None]:
# Columns entering the correlation matrix: the popularity score plus
# release date fields and the audio features.
corr_columns = [
    'popularity', 'released_year', 'released_month', 'bpm',
    'danceability_%', 'valence_%', 'energy_%', 'acousticness_%',
    'instrumentalness_%', 'liveness_%', 'speechiness_%',
]
df_features = df[corr_columns]

# Pairwise correlations (DataFrame.corr default method), drawn as an annotated heatmap.
cor_max = df_features.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(cor_max, annot=True, cmap='coolwarm', linewidths=0.5, fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap of Track Features and Popularity')
plt.show()

## Trends over time?

In [None]:
# Audio features whose per-year averages are plotted; listed once and reused
# for both the column selection and the plotting loop.
trend_features = ['bpm', 'danceability_%', 'valence_%', 'energy_%', 'acousticness_%',
                  'instrumentalness_%', 'liveness_%', 'speechiness_%']

df_features_2 = df[trend_features + ['released_year']]

# Mean of every feature per release year, with the year back as a regular column.
yearly_mean = df_features_2.groupby('released_year').mean().reset_index()

# One line per feature on a shared axis.
fig, ax = plt.subplots(figsize=(20, 10))
for feature in trend_features:
    ax.plot(yearly_mean['released_year'], yearly_mean[feature], label=feature)

ax.set_xlabel('Released Year')
ax.set_ylabel('Mean Value')
ax.set_title('Trends Over Time')
ax.legend(loc='upper left')
ax.grid(True)

plt.show()


## Which features have changed the most over the years?

In [None]:
# Standard deviation of each feature within every release year, then the
# average of those per-year values for each feature, sorted largest first.
# NOTE(review): this measures the spread of tracks *within* a year, averaged
# over years; it is not the variation of yearly means across years — confirm
# this is the intended notion of "change".
by_year = df_features_2.groupby('released_year')
yearly_std = by_year.std()
mean_std = yearly_std.mean(axis=0)
mean_std_sorted = mean_std.sort_values(ascending=False)

print(mean_std_sorted)

In [None]:
# Bar chart of the averaged per-year standard deviations computed in the
# previous cell (mean_std_sorted), largest first.
fig, ax = plt.subplots(figsize=(12, 6))
mean_std_sorted.plot(kind='bar', color='skyblue', ax=ax)
ax.set_xlabel('Features')
ax.set_ylabel('Mean Standard Deviation')
ax.set_title('Features with the Most Yearly Variation')
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()

plt.show()

## Mean Feature Values of the Top 10 Most Popular Songs of Each Decade

In [None]:
# add decade column: floor the release year to a multiple of 10 (1987 -> 1980)
df['decade'] = (df['released_year'] // 10) * 10

# Order tracks by decade, and within each decade from most to least popular.
df_sorted = df.sort_values(by=['decade', 'popularity'], ascending=[True, False])
decade_groups = df_sorted.groupby('decade')

# GroupBy.head(10) keeps the first ten rows of every decade in df_sorted's
# order, i.e. the ten most popular tracks per decade. This replaces growing a
# DataFrame with pd.concat inside a loop, which re-copies the accumulated rows
# on every iteration and starts from an empty frame.
top_10_decade = decade_groups.head(10).reset_index(drop=True)

# select columns of interest
cols_i = ['bpm', 'danceability_%', 'valence_%', 'energy_%', 'acousticness_%',
          'instrumentalness_%', 'liveness_%', 'speechiness_%', 'decade']

df_selected = top_10_decade[cols_i]

# calculate the mean of every feature per decade
mean_feat_decade = df_selected.groupby('decade').mean().reset_index()

# Long format (one row per decade/feature pair) so seaborn can use the
# feature name as hue; melt's default value column is named 'value'.
mean_feat_long = mean_feat_decade.melt(id_vars=['decade'], var_name='Feature')

# plot
fig, ax = plt.subplots(figsize=(25, 8))
sns.barplot(x='decade', y='value', hue='Feature', data=mean_feat_long, palette='Set3', ax=ax)

ax.set_xlabel('Decade')
ax.set_ylabel('Mean Feature Value')
ax.set_title('Mean Feature Values of Most Popular Tracks by Decade')
# Anchor the legend just outside the right edge of the axes.
ax.legend(title='Feature', loc='upper left', bbox_to_anchor=(1, 1))
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()

plt.show()