In [32]:
!pip install kagglehub
!pip install plotly
!pip install yellowbrick



##### Standard Import

In [33]:
import os
import kagglehub
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from yellowbrick.target import FeatureCorrelation

import warnings
warnings.filterwarnings('ignore')

In [34]:
# Download the "vatsalmavani/spotify-dataset" Kaggle dataset through kagglehub
# and keep the returned local path for the cells below.
path = kagglehub.dataset_download("vatsalmavani/spotify-dataset")
print("Path to dataset files:", path)
# List the top level of the download to see how the files are laid out.
files = os.listdir(path)
print("Files in dataset:", files)

Path to dataset files: /root/.cache/kagglehub/datasets/vatsalmavani/spotify-dataset/versions/1
Files in dataset: ['data']


In [35]:
# The download's top level holds a single 'data' entry (see the listing
# printed by the previous cell), so point `data_path` at it and list its files.
data_path = os.path.join(path, 'data')
files = os.listdir(data_path)
print("Files in 'data' folder:", files)

Files in 'data' folder: ['data_by_artist.csv', 'data_by_genres.csv', 'data_w_genres.csv', 'data_by_year.csv', 'data.csv']


#### Read Data

In [36]:
# Load the five CSV tables from `data_path`, in the same order as the names on
# the left-hand side: track-level data first, then the per-genre, per-artist
# and per-year aggregates, and finally the artist table with genres attached.
data, data_by_genres, data_by_artist, data_by_year, data_w_genres = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in (
        'data.csv',
        'data_by_genres.csv',
        'data_by_artist.csv',
        'data_by_year.csv',
        'data_w_genres.csv',
    )
)

**Dimension Check**

In [37]:
# Dimensions of the track-level table as (rows, columns).
data.shape

(170653, 19)

**Description:** The Spotify dataset, named data.csv, is an extensive collection containing information on over 170,000 tracks available on Spotify. The dataset spans an impressive range of nearly a century, covering tracks released between the years 1921 and 2020. It offers a rich resource for analyzing musical trends, evolution in genres, artists' contributions, and more.

In [38]:
# Column labels of the track-level table; each one is described below.
data.columns

Index(['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date',
       'speechiness', 'tempo'],
      dtype='object')

**Variables:**

id: A unique identifier for each track.

name: The track's title.

artists: Artist/s who participated in the track.

duration_ms: Length of the track in milliseconds (ms).


release_date: The track's release date in YYYY-MM-DD format, or at minimum the year only (YYYY).

year: The year in which the track was released.


**acousticness**: A confidence measure from 0.0 to 1.0 of whether the track is acoustic.


**danceability**: Describes how suitable a track is for dancing. A value of 0.0 is least danceable and 1.0 is most danceable.


**energy**: Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast and loud.


**instrumentalness**: Predicts whether a track contains no vocals.

**liveness**: Detects the presence of an audience in the recording.


**loudness**: The overall loudness of a track in decibels (dB).

**speechiness**: Detects the presence of spoken words in a track.

**tempo**: The overall estimated tempo of a track in beats per minute (BPM).

**valence**: A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track.

mode: Melodic content from the modality of a track. 1 = Major; 0 = Minor.

key: The estimated overall key of a track from the Pitch Class notation.

popularity: A track's popularity, 0-100 where 100 is the most popular based on
Spotify's algorithm.

explicit: Whether or not the track has explicit lyrics. 1 = True; 0 = False.

In [39]:
# Preview the track-level table (the notebook renders only the first and last rows).
data

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.98200,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878000,10,0.6650,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.9630,1921,0.73200,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.000000,7,0.1600,-12.441,1,Clancy Lowered the Boom,5,1921,0.4150,60.936
2,0.0394,1921,0.96100,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913000,3,0.1010,-14.850,1,Gati Bali,5,1921,0.0339,110.339
3,0.1650,1921,0.96700,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,0.000028,5,0.3810,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.2530,1921,0.95700,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,0.000002,3,0.2290,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.0380,101.665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170648,0.6080,2020,0.08460,"['Anuel AA', 'Daddy Yankee', 'KAROL G', 'Ozuna...",0.786,301714,0.808,0,0KkIkfsLEJbrcIhYsCL7L5,0.000289,7,0.0822,-3.702,1,China,72,2020-05-29,0.0881,105.029
170649,0.7340,2020,0.20600,['Ashnikko'],0.717,150654,0.753,0,0OStKKAuXlxA0fMH54Qs6E,0.000000,7,0.1010,-6.020,1,Halloweenie III: Seven Days,68,2020-10-23,0.0605,137.936
170650,0.6370,2020,0.10100,['MAMAMOO'],0.634,211280,0.858,0,4BZXVFYCb76Q0Klojq4piV,0.000009,4,0.2580,-2.226,0,AYA,76,2020-11-03,0.0809,91.688
170651,0.1950,2020,0.00998,['Eminem'],0.671,337147,0.623,1,5SiZJoLXp3WOl3J4C8IK0d,0.000008,2,0.6430,-7.161,1,Darkness,70,2020-01-17,0.3080,75.055


In [40]:
pip install annoy



In [44]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
from sklearn.decomposition import TruncatedSVD
from annoy import AnnoyIndex


# ---------------------------------------------------------------------------
# Feature pipeline: scale numeric columns, one-hot encode key/mode, derive
# date features, add TF-IDF text features, reduce with SVD and index the
# result with Annoy for approximate nearest-neighbour lookups.
# ---------------------------------------------------------------------------

# 1. Work on a copy so the raw `data` frame is not mutated by the scaling
#    below and this cell can be re-run from the same starting point.
df = data.copy()

# 2. Preprocess Numerical Features
numerical_features = ['energy', 'instrumentalness', 'key', 'liveness', 'loudness',
                      'mode', 'popularity', 'speechiness', 'tempo']

# Standardize numerical features to avoid bias
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# 3. Encode Categorical Features
df['explicit'] = df['explicit'].astype(int)
# 'key' and 'mode' were standardized above, so the generated dummy column
# names carry the scaled category values rather than the raw codes.
df = pd.get_dummies(df, columns=['key', 'mode'], drop_first=True)

# 4. Handle 'release_date'
# Year that 'song_age' is measured against.
SONG_AGE_REFERENCE_YEAR = 2024

# Convert to datetime; unparseable entries become NaT and are imputed below.
# NOTE(review): the column mixes 'YYYY' and 'YYYY-MM-DD' strings (see the data
# preview); depending on the pandas version, format inference may coerce one
# of those shapes to NaT -- confirm the share of NaT values is acceptable.
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['release_year'] = df['release_date'].dt.year
df['release_month'] = df['release_date'].dt.month
df['release_day'] = df['release_date'].dt.day
df['song_age'] = SONG_AGE_REFERENCE_YEAR - df['release_year']

# Handle missing values. Plain assignment is used instead of
# `df[col].fillna(..., inplace=True)`: the in-place form acts on a temporary
# column selection and is not guaranteed to write back to `df`.
df['release_year'] = df['release_year'].fillna(df['release_year'].mean())
df['release_month'] = df['release_month'].fillna(df['release_month'].mode()[0])
df['release_day'] = df['release_day'].fillna(df['release_day'].mode()[0])
df['song_age'] = df['song_age'].fillna(df['song_age'].mean())

# Drop the original 'release_date' to avoid duplication
df = df.drop(columns=['release_date'])

# 5. Text Feature Processing
# Combine 'name' and 'artists'; a missing value is treated as empty text so a
# single NaN cannot turn the combined string into NaN and break TF-IDF.
df['combined_text'] = df['name'].fillna('') + " " + df['artists'].fillna('')

# Apply TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=500, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_text'])

# 6. Prepare Numerical Features for Sparse Matrix
# Identifier and free-text columns never enter the numeric feature matrix.
non_feature_columns = ['id', 'name', 'artists', 'combined_text']
feature_columns = df.drop(columns=non_feature_columns).columns

# Identify and drop any remaining non-numeric columns
non_numeric_cols = df[feature_columns].select_dtypes(include=['object', 'string']).columns.tolist()

if non_numeric_cols:
    print("Non-Numeric Columns Detected:", non_numeric_cols)
    # Decide whether to drop or encode them. Here, we'll drop them.
    df = df.drop(columns=non_numeric_cols)
    # Recompute with the same exclusion list as above (the previous code
    # referenced a non-existent 'artists_song' column here and raised KeyError).
    feature_columns = df.drop(columns=non_feature_columns).columns
    print(f"Dropped non-numeric columns: {non_numeric_cols}")
else:
    print("No Non-Numeric Columns Detected in Feature Columns.")

numerical_features_array = df[feature_columns].values.astype('float32')

# Convert to sparse matrix
numerical_sparse = csr_matrix(numerical_features_array)

print("Numerical Sparse Matrix Shape:", numerical_sparse.shape)
print("Numerical Sparse Matrix Type:", type(numerical_sparse))

# 7. Combine Numerical and TF-IDF Features
final_features = hstack([numerical_sparse, tfidf_matrix])

print("Final Features Shape:", final_features.shape)

# 8. Dimensionality Reduction with Truncated SVD
svd = TruncatedSVD(n_components=300, random_state=42)
reduced_features = svd.fit_transform(final_features)
print("Reduced Features Shape:", reduced_features.shape)

# 9. Build the Annoy index (angular distance) over the reduced vectors.
# Item i in the index is row position i of `reduced_features`.
dim = reduced_features.shape[1]
annoy_index = AnnoyIndex(dim, 'angular')

for i, vector in enumerate(reduced_features):
    annoy_index.add_item(i, vector)

annoy_index.build(10)
annoy_index.save('spotify_annoy_index.ann')
print("Annoy index built and saved successfully.")

# 10. Create Mapping from Song ID to Index
# `Series.drop_duplicates()` compares the *values* (the row labels), so it does
# not remove repeated song IDs. De-duplicate on the ID index instead, keeping
# the first occurrence, so that `indices[song_id]` always yields one scalar.
indices = pd.Series(df.index, index=df['id'])
indices = indices[~indices.index.duplicated(keep='first')]
print("Indices Mapping Created. Sample:")
print(indices.head())


def get_recommendations_annoy(song_id, top_n=10):
    """Return the IDs of the songs closest to `song_id` in the Annoy index.

    Args:
        song_id: Track ID to look up in the module-level `indices` mapping.
        top_n: Maximum number of neighbours to return.

    Returns:
        A list of up to `top_n` track IDs taken from `df['id']`, or the string
        "Song ID not found." when `song_id` is not a key of `indices`.
    """
    if song_id in indices:
        item = indices[song_id]
        # Request one extra neighbour because the query item is normally
        # returned as its own nearest neighbour and is filtered out below.
        neighbours = annoy_index.get_nns_by_item(item, top_n + 1)
        others = [candidate for candidate in neighbours if candidate != item]
        return df['id'].iloc[others[:top_n]].tolist()
    return "Song ID not found."


# Call the lookup with a placeholder ID; as the output below shows, this
# returns the "Song ID not found." message instead of a list of IDs.
recommended_song_ids = get_recommendations_annoy('some_song_id')
print("Recommended Song IDs:", recommended_song_ids)

def get_song_details(song_id):
    """Look up the title and artists of every row in `df` whose id equals `song_id`.

    Returns:
        A list of {'name': ..., 'artists': ...} dicts, one per matching row;
        the list is empty when no row matches.
    """
    matching_rows = df.loc[df['id'] == song_id, ['name', 'artists']]
    return matching_rows.to_dict('records')

No Non-Numeric Columns Detected in Feature Columns.
Numerical Sparse Matrix Shape: (170653, 29)
Numerical Sparse Matrix Type: <class 'scipy.sparse._csr.csr_matrix'>
Final Features Shape: (170653, 529)
Reduced Features Shape: (170653, 300)
Annoy index built and saved successfully.
Indices Mapping Created. Sample:
id
4BJqT0PrAfrxzMOxytFOIz    0
7xPhfUan2yNtyFG0cUWkt8    1
1o6I8BglA6ylDMrIELygv1    2
3ftBPsC5vPBKxYSee08FDH    3
4d6HGyGT8e121BsdKmw9v6    4
dtype: int64
Recommended Song IDs: Song ID not found.


In [45]:
# Look up recommendations for one known track and print each one's details.
song_id = '4BJqT0PrAfrxzMOxytFOIz'
recommended_ids = get_recommendations_annoy(song_id)

if isinstance(recommended_ids, str):
    # The lookup returns a message string (not a list) for an unknown ID;
    # print it as-is instead of iterating over its characters.
    print(recommended_ids)
else:
    print(f"Recommendations for {song_id}:")
    for rid in recommended_ids:
        details = get_song_details(rid)
        print(details)

Recommendations for 4BJqT0PrAfrxzMOxytFOIz:
[{'name': 'Violin Concerto in D Minor, Op. 47: I. Allegro moderato', 'artists': "['Jean Sibelius', 'Jascha Heifetz', 'Walter Hendl']"}]
[{'name': 'Hella Often Freestyle', 'artists': "['Old Grape God']"}]
[{'name': 'Sinfonia Concertante for Violin, Viola and Orchestra in E-Flat Major, K. 364: I. Allegro maestoso', 'artists': "['Wolfgang Amadeus Mozart', 'Daniel Barenboim', 'English Chamber Orchestra']"}]
[{'name': 'Whole Lotta Love - (Live at MSG 1973) [Remaster]', 'artists': "['Led Zeppelin']"}]
[{'name': 'Lush Life - Rudy Van Gelder Remaster', 'artists': "['John Coltrane']"}]
[{'name': 'Do You Feel Like We Do', 'artists': "['Peter Frampton']"}]
[{'name': 'Symphony No. 5 in C-Sharp Minor: V. Rondo-Finale', 'artists': "['Gustav Mahler', 'Bruno Walter', 'New York Philharmonic']"}]
[{'name': 'Symphony No. 5 in C-Sharp Minor: V. Rondo-Finale', 'artists': "['Gustav Mahler', 'Bruno Walter', 'New York Philharmonic']"}]
[{'name': 'Violin Partita No. 