In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it,
# so the exact dataset paths available to this notebook are visible.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Location of the Spotify tracks CSV inside the Kaggle input mount.
# NOTE(review): the leading '-' in the dataset folder name looks unusual; verify it
# against the file listing printed by the first cell.
DATA_PATH = '/kaggle/input/-spotify-tracks-dataset/dataset.csv'

# Fail early with an actionable message instead of pandas' bare "No such file" error
# when the dataset has not been attached to the notebook session.
if not os.path.isfile(DATA_PATH):
    raise FileNotFoundError(
        f"Dataset not found at {DATA_PATH!r}. Attach the Spotify tracks dataset to this "
        "notebook (or update DATA_PATH to a local copy) and re-run this cell."
    )

# NOTE(review): latin-1 never raises on decode, but it will garble non-ASCII
# track/artist names if the file is actually UTF-8 -- confirm the real encoding.
df = pd.read_csv(DATA_PATH, encoding='latin-1')

# Preview the first rows as the cell output.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/-spotify-tracks-dataset/dataset.csv'

In [None]:
# Print a concise summary of the loaded frame (columns, dtypes, non-null counts, memory usage).
df.info()

In [None]:
# Global plot appearance for every figure created after this cell.
# 'seaborn-v0_8' is the matplotlib style-sheet name for the seaborn look.
plt.style.use('seaborn-v0_8')
# Make seaborn's "husl" palette the default colour cycle.
sns.set_palette("husl")

In [None]:
# Histogram of the `popularity` column over all tracks in `df`.

# savefig does not create missing directories; make sure the output folder exists
# so this cell works on a fresh kernel / clean working directory.
os.makedirs('./figs', exist_ok=True)

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='popularity', bins=50, ax=ax)
ax.set_title('Distribution of Song Popularity')
ax.set_xlabel('Popularity Score')
ax.set_ylabel('Count')
fig.savefig('./figs/song-pop-dist.png')

In [None]:
# Pairwise correlation heatmap of the numeric audio-feature columns.
audio_features = ['danceability', 'energy', 'loudness', 'speechiness',
                  'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

# savefig does not create missing directories; make sure the output folder exists.
os.makedirs('./figs', exist_ok=True)

fig, ax = plt.subplots(figsize=(12, 10))
# center=0 anchors the diverging colormap so positive and negative correlations
# get opposite colours.
sns.heatmap(df[audio_features].corr(), annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix of Audio Features')
fig.savefig('./figs/corr-matrix-features.png')

In [None]:
# Box plot of popularity for the ten genres with the most rows in `df`.
# `top_10_genres` is kept as a notebook-level name because a later cell reuses it.
top_10_genres = df['track_genre'].value_counts().nlargest(10).index

# savefig does not create missing directories; make sure the output folder exists.
os.makedirs('./figs', exist_ok=True)

fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=df[df['track_genre'].isin(top_10_genres)],
            x='track_genre', y='popularity', ax=ax)
# Genre names are long; tilt them so they do not overlap.
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Popularity Distribution by Genre')
fig.savefig('./figs/song-pop-dist-by-genre.png')

In [None]:
# Scatter of energy vs danceability, coloured by genre, restricted to the genres in
# `top_10_genres` (this name must already have been defined by an earlier cell).

# savefig does not create missing directories; make sure the output folder exists.
os.makedirs('./figs', exist_ok=True)

fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df[df['track_genre'].isin(top_10_genres)],
                x='energy', y='danceability', hue='track_genre', alpha=0.6, ax=ax)
ax.set_title('Energy vs Danceability by Genre')
fig.savefig('./figs/genre-energy-vs-danceability.png')

In [None]:
# Horizontal bar chart: how many of the 100 most popular tracks fall in each genre.
top_100_songs = df.nlargest(100, 'popularity')
# The original variable was called `top_50_songs` although it holds 100 rows; the old
# name is kept as an alias in case another cell still refers to it.
top_50_songs = top_100_songs
genre_counts = top_100_songs['track_genre'].value_counts()

# savefig does not create missing directories; make sure the output folder exists.
os.makedirs('./figs', exist_ok=True)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=genre_counts.values, y=genre_counts.index, ax=ax)

ax.set_title('Genre Distribution in Top 100 Most Popular Songs', fontsize=12, pad=15)
ax.set_xlabel('Number of Songs', fontsize=10)
ax.set_ylabel('Genre', fontsize=10)
# Write each bar's count just past the end of the bar.
for i, v in enumerate(genre_counts.values):
    ax.text(v, i, f' {v}', va='center')

# Save BEFORE show: once show() has displayed the figure, a following plt.savefig()
# writes a new, empty figure instead of this chart.
fig.savefig('./figs/top-100-genre.png')
plt.show()

In [None]:
# Scatter of loudness vs energy for the three genres with the most rows in `df`.
top_3_genres = df['track_genre'].value_counts().head(3).index
genre_subset = df[df['track_genre'].isin(top_3_genres)]

# savefig does not create missing directories; make sure the output folder exists.
os.makedirs('./figs', exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=genre_subset, x='loudness', y='energy',
                hue='track_genre', alpha=0.6, ax=ax)
ax.set_title('Loudness vs Energy for Top 3 Genres')
ax.set_xlabel('Loudness')
ax.set_ylabel('Energy')
fig.savefig('./figs/top-3-loudness-vs-energy.png')

In [None]:
# Bar chart of track duration (in minutes) for the 20 most popular tracks.
# .assign builds the derived column on a fresh frame instead of setting a column on
# the result of nlargest, which avoids pandas' SettingWithCopyWarning.
top_20_songs = (
    df.nlargest(20, 'popularity')
      .assign(duration_min=lambda songs: songs['duration_ms'] / 60000)  # ms -> minutes
)

# savefig does not create missing directories; make sure the output folder exists.
os.makedirs('./figs', exist_ok=True)

fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x='duration_min', y='track_name', data=top_20_songs, ax=ax)
ax.set_title('Duration of Most Popular Songs')
ax.set_xlabel('Duration (minutes)')
ax.set_ylabel('Song Name')
# NOTE(review): the file name says "top-8" but the chart shows the top 20; the name is
# left unchanged so anything that already references this file keeps working.
fig.savefig('./figs/pop-duration-top-8.png')

In [None]:
# Mood classification: valence vs energy for a sample of tracks from the five genres
# with the most rows, with the four quadrants labelled by mood.
top_5_genres = df['track_genre'].value_counts().head(5).index
df_filtered = df[df['track_genre'].isin(top_5_genres)]

# Draw at most `max_per_genre` tracks per genre. Sampling each group explicitly (rather
# than groupby.apply) keeps the `track_genre` column on every pandas version, and the
# fixed seed makes the figure reproducible across Restart & Run All.
max_per_genre = 200
sample_seed = 42
df_sampled = pd.concat(
    [
        genre_rows.sample(min(len(genre_rows), max_per_genre), random_state=sample_seed)
        for _, genre_rows in df_filtered.groupby('track_genre')
    ],
    ignore_index=True,
)

# savefig does not create missing directories; make sure the output folder exists.
os.makedirs('./figs', exist_ok=True)

fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df_sampled,
                x='valence',
                y='energy',
                hue='track_genre',
                style='track_genre',  # a different marker per genre
                s=100,                # larger points
                alpha=0.6,            # slightly transparent
                ax=ax)

# Split the plane into four quadrants at valence = 0.5 and energy = 0.5.
ax.axhline(y=0.5, color='gray', linestyle='--', alpha=0.3)
ax.axvline(x=0.5, color='gray', linestyle='--', alpha=0.3)

# One label per quadrant: (x, y, text).
quadrant_labels = [
    (0.25, 0.85, 'Angry/Tense'),
    (0.75, 0.85, 'Happy/Euphoric'),
    (0.25, 0.15, 'Sad/Depressing'),
    (0.75, 0.15, 'Peaceful/Chill'),
]
for label_x, label_y, label_text in quadrant_labels:
    ax.text(label_x, label_y, label_text,
            horizontalalignment='center',
            bbox=dict(facecolor='white', alpha=0.7))

ax.set_title('Mood Classification by Top 5 Genres\nValence vs Energy', pad=20, fontsize=14)
ax.set_xlabel('Valence (Musical Positiveness)', fontsize=12)
ax.set_ylabel('Energy', fontsize=12)

# Place the legend outside the axes so it does not cover data points.
ax.legend(title="Genre", bbox_to_anchor=(1.05, 1), loc='upper left')

fig.tight_layout()
# Save BEFORE show: once show() has displayed the figure, a following plt.savefig()
# writes a new, empty figure instead of this chart.
fig.savefig('./figs/mood-classification-top-5-genres.png')
plt.show()

In [None]:
# Within the genre that has the most rows, compare the 100 most popular and the 100
# least popular tracks on energy vs danceability.
most_common_genre = df['track_genre'].value_counts().index[0]
genre_df = df[df['track_genre'] == most_common_genre].copy()

popular_songs = genre_df.nlargest(100, 'popularity')
unpopular_songs = genre_df.nsmallest(100, 'popularity')

# savefig does not create missing directories; make sure the output folder exists.
os.makedirs('./figs', exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(popular_songs['energy'], popular_songs['danceability'],
           alpha=0.6, label='Popular', color='green')
ax.scatter(unpopular_songs['energy'], unpopular_songs['danceability'],
           alpha=0.6, label='Unpopular', color='red')

ax.set_xlabel('Energy')
ax.set_ylabel('Danceability')
ax.set_title(f'Popular vs Unpopular Songs in {most_common_genre}')
ax.legend()
fig.tight_layout()
# Save BEFORE show: once show() has displayed the figure, a following plt.savefig()
# writes a new, empty figure instead of this chart.
# NOTE(review): the file name hard-codes "acoustic" while the genre is computed from
# the data; the name is left unchanged so existing references keep working.
fig.savefig('./figs/pop-unpopular-acoustic-same-features.png')
plt.show()

In [None]:
# Spread of selected audio features across all tracks of the artist value that
# occurs most often in the `artists` column.
popular_artist = df['artists'].value_counts().index[0]
artist_songs = df[df['artists'] == popular_artist]

# savefig does not create missing directories; make sure the output folder exists.
os.makedirs('./figs', exist_ok=True)

features = ['danceability', 'energy', 'valence', 'acousticness']
fig, ax = plt.subplots(figsize=(10, 6))
# Wide-form input: seaborn draws one box per selected column.
sns.boxplot(data=artist_songs[features], ax=ax)
ax.set_title(f'Audio Feature Variation for {popular_artist}')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
# Save BEFORE show: once show() has displayed the figure, a following plt.savefig()
# writes a new, empty figure instead of this chart.
fig.savefig('./figs/same-artist-variation.png')
plt.show()

In [None]:

# Parallel-coordinates plot of six audio features for five randomly chosen tracks.
features = ['danceability', 'energy', 'valence', 'acousticness', 'instrumentalness', 'liveness']

# A fixed seed keeps the same five tracks across Restart & Run All. .assign adds the
# per-track `id` (0..n-1, used as the class column so each track gets its own line)
# on a fresh frame instead of setting a column on the sampled slice.
sample_songs = (
    df.sample(5, random_state=42)
      .assign(id=lambda songs: range(len(songs)))
)

fig, ax = plt.subplots(figsize=(12, 6))
parallel_coordinates(sample_songs[features + ['id']], 'id', colormap=plt.cm.Set3, ax=ax)
ax.set_title('Parallel Coordinates Plot of Song Features')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
# Save BEFORE show: once show() has displayed the figure, a following plt.savefig()
# writes a new, empty figure instead of this chart.
# NOTE(review): unlike the other figures this one is written to the working directory
# rather than ./figs; the path is left unchanged so existing references keep working.
fig.savefig('parallel-feature-plot.png')
plt.show()