In [2]:
# 01_data_exploration.ipynb

# 📌 Step 1: Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot style
sns.set(style="whitegrid")

# 📌 Step 2: Load the Dataset
df = pd.read_csv('../data/spotify_dataset.csv')  # Adjust path if needed
print("✅ Dataset loaded successfully!")

# 📌 Step 3: Overview
print(f"\nDataset shape: {df.shape}")
print("\nColumns:")
print(df.columns)

print("\nSample rows:")
display(df.head())

# 📌 Step 4: Null / Missing Values
print("\nMissing values per column:")
print(df.isnull().sum())

# 📌 Step 5: Basic Descriptive Stats
print("\nBasic statistics for numeric columns:")
display(df.describe())

# 📌: Check Distribution of Key Features
numerical_cols = ['length', 'tempo', 'loudness']

for col in numerical_cols:
    plt.figure(figsize=(6, 3))
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.show()

# 📌 Step 7: ategorical Counts
categorical_cols = ['genre', 'emotion', 'key', 'time signature', 'explicit']

for col in categorical_cols:
    plt.figure(figsize=(8, 3))
    sns.countplot(x=col, data=df, order=df[col].value_counts().index)
    plt.title(f'Value Counts for {col}')
    plt.xticks(rotation=45)
    plt.show()

# 📌 Step 8: Popularity distribution and possible label creation
plt.figure(figsize=(7, 4))
sns.histplot(df['popularity'], bins=30, kde=True)
plt.title("Spotif Popularity Score Distribution")
plt.xlabel("Popularity")
plt.ylabel("Frequency")
plt.show()

# 📌 Step 9: Define Binary Label (Success vs. Not)
# Adjust threshold as needed
df['success'] = (df['popularity'] >= 60).astype(int)
print(f"\nSuccess label counts:\n{df['success'].value_counts()}")

# Save to new CSV for downstream work
df.to_csv('../data/songs_dataset_with_labels.csv', index=False)
print("\n💾 Dataset with 'success' label saved for further processing.")

✅ Dataset loaded successfully!

Dataset shape: (498052, 39)

Columns:
Index(['Artist(s)', 'song', 'text', 'Length', 'emotion', 'Genre', 'Album',
       'Release Date', 'Key', 'Tempo', 'Loudness (db)', 'Time signature',
       'Explicit', 'Popularity', 'Energy', 'Danceability', 'Positiveness',
       'Speechiness', 'Liveness', 'Acousticness', 'Instrumentalness',
       'Good for Party', 'Good for Work/Study',
       'Good for Relaxation/Meditation', 'Good for Exercise',
       'Good for Running', 'Good for Yoga/Stretching', 'Good for Driving',
       'Good for Social Gatherings', 'Good for Morning Routine',
       'Similar Artist 1', 'Similar Song 1', 'Similarity Score 1',
       'Similar Artist 2', 'Similar Song 2', 'Similarity Score 2',
       'Similar Artist 3', 'Similar Song 3', 'Similarity Score 3'],
      dtype='object')

Sample rows:


Unnamed: 0,Artist(s),song,text,Length,emotion,Genre,Album,Release Date,Key,Tempo,...,Good for Morning Routine,Similar Artist 1,Similar Song 1,Similarity Score 1,Similar Artist 2,Similar Song 2,Similarity Score 2,Similar Artist 3,Similar Song 3,Similarity Score 3
0,!!!,Even When the Waters Cold,Friends told her she was better off at the bot...,03:47,sadness,hip hop,Thr!!!er,2013-04-29,D min,0.43787,...,0,Corey Smith,If I Could Do It Again,0.986061,Toby Keith,Drinks After Work,0.983719,Space,Neighbourhood,0.983236
1,!!!,One Girl / One Boy,"Well I heard it, playing soft From a drunken b...",04:03,sadness,hip hop,Thr!!!er,2013-04-29,A# min,0.508876,...,0,Hiroyuki Sawano,BRE@TH//LESS,0.995409,When In Rome,Heaven Knows,0.990905,Justice Crew,Everybody,0.984483
2,!!!,Pardon My Freedom,"Oh my god, did I just say that out loud? Shoul...",05:51,joy,hip hop,Louden Up Now,2004-06-08,A Maj,0.532544,...,0,Ricky Dillard,More Abundantly Medley Live,0.993176,Juliet,Avalon,0.965147,The Jacksons,Lovely One,0.956752
3,!!!,Ooo,[Verse 1] Remember when I called you on the te...,03:44,joy,hip hop,As If,2015-10-16,A min,0.538462,...,0,Eric Clapton,Man Overboard,0.992749,Roxette,Don't Believe In Accidents,0.991494,Tiwa Savage,My Darlin,0.990381
4,!!!,Freedom 15,[Verse 1] Calling me like I got something to s...,06:00,joy,hip hop,As If,2015-10-16,F min,0.544379,...,0,Cibo Matto,Lint Of Love,0.98161,Barrington Levy,Better Than Gold,0.981524,Freestyle,Its Automatic,0.981415



Missing values per column:
Artist(s)                              0
song                                  14
text                                   0
Length                                 0
emotion                                0
Genre                                  0
Album                                 49
Release Date                      147683
Key                                    0
Tempo                                  0
Loudness (db)                          0
Time signature                         8
Explicit                               0
Popularity                             0
Energy                                 0
Danceability                           0
Positiveness                           0
Speechiness                            0
Liveness                               0
Acousticness                           0
Instrumentalness                       0
Good for Party                         0
Good for Work/Study                    0
Good for Relaxation/Meditatio

Unnamed: 0,Tempo,Loudness (db),Popularity,Energy,Danceability,Positiveness,Speechiness,Liveness,Acousticness,Instrumentalness,...,Good for Relaxation/Meditation,Good for Exercise,Good for Running,Good for Yoga/Stretching,Good for Driving,Good for Social Gatherings,Good for Morning Routine,Similarity Score 1,Similarity Score 2,Similarity Score 3
count,498052.0,498052.0,498052.0,498052.0,498052.0,498052.0,498052.0,498052.0,498052.0,498052.0,...,498052.0,498052.0,498052.0,498052.0,498052.0,498052.0,498052.0,498052.0,498052.0,498052.0
mean,0.531244,0.762177,30.486453,62.744027,58.285191,47.067467,11.397157,19.787725,26.056827,7.361777,...,0.031744,0.184005,0.053199,0.022034,0.054735,0.00929,0.063618,0.982887,0.977765,0.974715
std,0.173153,0.07297,17.189269,22.688164,17.35293,24.091678,12.28215,16.310829,29.618874,20.736406,...,0.175317,0.387489,0.224431,0.146794,0.227463,0.095937,0.244071,0.013297,0.014951,0.015961
min,0.0,0.0,0.0,0.0,6.0,0.0,2.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002656,0.002647,0.002647
25%,0.390533,0.728743,19.0,48.0,46.0,28.0,4.0,10.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.976614,0.970859,0.967244
50%,0.526627,0.777616,28.0,65.0,59.0,46.0,6.0,13.0,13.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.984911,0.980117,0.977254
75%,0.64497,0.81141,40.0,81.0,71.0,66.0,14.0,25.0,44.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.991553,0.987161,0.984789
max,1.0,1.0,100.0,100.0,99.0,100.0,97.0,100.0,100.0,100.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


KeyError: 'length'

<Figure size 600x300 with 0 Axes>