## Package Installation

In [1]:
!pip install spotipy
!pip install kaggle



In [2]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


## Standard Import

In [3]:
import warnings
warnings.filterwarnings('ignore')

import os
import kagglehub
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import yaml
import json
import re
import sys
import itertools

from nltk.corpus import stopwords
from scipy.sparse import csr_matrix, vstack
from scipy.sparse.linalg import svds

from sklearn.feature_selection import RFECV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, normalize
from sklearn.metrics import mean_squared_error, classification_report, accuracy_score, precision_score, recall_score, silhouette_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from xgboost import XGBClassifier

import spotipy
from spotipy.oauth2 import SpotifyOAuth
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util

## Find the correct file path with the datasets and their names

In [4]:
# Download latest version
path = kagglehub.dataset_download("vatsalmavani/spotify-dataset")

print("Path to dataset files:", path)

# List all files in the dataset directory
files = os.listdir(path)
print("Files in dataset:", files)

# Adjust path to point to the 'data' folder
data_path = os.path.join(path, 'data')

# List files in the 'data' folder
files = os.listdir(data_path)
print("Files in 'data' folder:", files)

Path to dataset files: /Users/zhiyuanchen/.cache/kagglehub/datasets/vatsalmavani/spotify-dataset/versions/1
Files in dataset: ['data']
Files in 'data' folder: ['data_by_genres.csv', 'data.csv', 'data_by_artist.csv', 'data_by_year.csv', 'data_w_genres.csv']


## Read Data

In [5]:
data = pd.read_csv(os.path.join(data_path, 'data.csv'))
data_by_genres = pd.read_csv(os.path.join(data_path, 'data_by_genres.csv'))
data_by_artist = pd.read_csv(os.path.join(data_path, 'data_by_artist.csv'))
data_by_year = pd.read_csv(os.path.join(data_path, 'data_by_year.csv'))
data_w_genres = pd.read_csv(os.path.join(data_path, 'data_w_genres.csv'))

## data.csv

### EDA and (Some) Data Preprocessing

In [6]:
 data.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [7]:
data.shape

(170653, 19)

In [8]:
data.columns

Index(['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date',
       'speechiness', 'tempo'],
      dtype='object')

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170653 entries, 0 to 170652
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           170653 non-null  float64
 1   year              170653 non-null  int64  
 2   acousticness      170653 non-null  float64
 3   artists           170653 non-null  object 
 4   danceability      170653 non-null  float64
 5   duration_ms       170653 non-null  int64  
 6   energy            170653 non-null  float64
 7   explicit          170653 non-null  int64  
 8   id                170653 non-null  object 
 9   instrumentalness  170653 non-null  float64
 10  key               170653 non-null  int64  
 11  liveness          170653 non-null  float64
 12  loudness          170653 non-null  float64
 13  mode              170653 non-null  int64  
 14  name              170653 non-null  object 
 15  popularity        170653 non-null  int64  
 16  release_date      17

In [10]:
data.describe()

Unnamed: 0,valence,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo
count,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0
mean,0.528587,1976.787241,0.502115,0.537396,230948.3,0.482389,0.084575,0.16701,5.199844,0.205839,-11.46799,0.706902,31.431794,0.098393,116.86159
std,0.263171,25.917853,0.376032,0.176138,126118.4,0.267646,0.278249,0.313475,3.515094,0.174805,5.697943,0.455184,21.826615,0.16274,30.708533
min,0.0,1921.0,0.0,0.0,5108.0,0.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0
25%,0.317,1956.0,0.102,0.415,169827.0,0.255,0.0,0.0,2.0,0.0988,-14.615,0.0,11.0,0.0349,93.421
50%,0.54,1977.0,0.516,0.548,207467.0,0.471,0.0,0.000216,5.0,0.136,-10.58,1.0,33.0,0.045,114.729
75%,0.747,1999.0,0.893,0.668,262400.0,0.703,0.0,0.102,8.0,0.261,-7.183,1.0,48.0,0.0756,135.537
max,1.0,2020.0,0.996,0.988,5403500.0,1.0,1.0,1.0,11.0,1.0,3.855,1.0,100.0,0.97,243.507


In [11]:
df = data.copy()

In [12]:
df['popularity'] = df['popularity'] / 100  # normalize popularity feature between 0 and 1

# Convert duration from milliseconds to seconds
df['duration_s'] = df['duration_ms'] / 1000

df = df.drop(['duration_ms'], axis=1)

# # Apply log transformation (log1p to handle zero or near-zero values)
# df['log_duration'] = np.log1p(df['duration_s'])

# # Plotting the original and log-transformed duration_s for comparison
# plt.figure(figsize=(14, 6))

# # Original duration_s distribution
# plt.subplot(1, 2, 1)
# sns.histplot(df['duration_s'], kde=True, color="skyblue")
# plt.title("Original Duration Distribution (s)")
# plt.xlabel("Duration (s)")

# df = df.drop(['duration_ms', 'duration_s'], axis=1)

# # Log-transformed duration_s distribution
# plt.subplot(1, 2, 2)
# sns.histplot(df['log_duration'], kde=True, color="salmon")
# plt.title("Log-Transformed Duration Distribution")
# plt.xlabel("Log(Duration)")

# plt.tight_layout()
# plt.show()

In [13]:
df.describe()

Unnamed: 0,valence,year,acousticness,danceability,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,duration_s
count,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0
mean,0.528587,1976.787241,0.502115,0.537396,0.482389,0.084575,0.16701,5.199844,0.205839,-11.46799,0.706902,0.314318,0.098393,116.86159,230.948311
std,0.263171,25.917853,0.376032,0.176138,0.267646,0.278249,0.313475,3.515094,0.174805,5.697943,0.455184,0.218266,0.16274,30.708533,126.118415
min,0.0,1921.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,5.108
25%,0.317,1956.0,0.102,0.415,0.255,0.0,0.0,2.0,0.0988,-14.615,0.0,0.11,0.0349,93.421,169.827
50%,0.54,1977.0,0.516,0.548,0.471,0.0,0.000216,5.0,0.136,-10.58,1.0,0.33,0.045,114.729,207.467
75%,0.747,1999.0,0.893,0.668,0.703,0.0,0.102,8.0,0.261,-7.183,1.0,0.48,0.0756,135.537,262.4
max,1.0,2020.0,0.996,0.988,1.0,1.0,1.0,11.0,1.0,3.855,1.0,1.0,0.97,243.507,5403.5


In [14]:
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
print(numerical_features)

categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
print(categorical_columns)

['valence', 'year', 'acousticness', 'danceability', 'energy', 'explicit', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity', 'speechiness', 'tempo', 'duration_s']
['artists', 'id', 'name', 'release_date']


### Correlation

In [15]:
# data2 = data.copy()

# # Drop the target variable from the features
# X = data2.drop(columns=['popularity', 'artists', 'id', 'name', 'release_date'])

# # Target variable
# y = data2['popularity']

# # Split the data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# reg = LinearRegression()
# reg.fit(X_train, y_train)

# # reg.fit_transform(X_test, y_test)

# y_pred = reg.predict(X_test)

# # Calculate accuracy
# print("MSE", mean_squared_error(y_test, y_pred))

In [16]:
# correlation_matrix = df[numerical_features].corr()

# plt.figure(figsize=(10, 10))
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
# plt.tight_layout()
# plt.show()

In [17]:
# filtered_corr = correlation_matrix[(abs(correlation_matrix) > 0.5) & (correlation_matrix != 1.0)]

# plt.figure(figsize=(10, 8))
# sns.heatmap(filtered_corr, annot=True, cmap='coolwarm', mask=filtered_corr.isnull(), vmin=0.5, vmax=1)
# plt.title('Correlations Greater than 0.5')
# plt.show()

In [18]:
# # Summary statistics: mean, median, std, min, and max
# summary_statistics = df[numerical_features].describe().T[['mean', '50%', 'std', 'min', 'max']]
# summary_statistics.rename(columns={'50%': 'median'}, inplace=True)

# plt.figure(figsize=(12, 12))
# for i, feature in enumerate(numerical_features, 1):
#     plt.subplot(4, 4, i)
#     sns.histplot(df[feature], kde=True)
#     plt.title(f"Distribution of {feature}")
#     plt.xlabel(feature)
#     plt.ylabel("Frequency")

# plt.tight_layout()
# plt.show()

In [19]:
# minmax_variables = ['valence', 'acousticness', 'danceability', 'energy', 'liveness', 'speechiness', 'instrumentalness']

# standard_variables = [ 'key', 'liveness', 'loudness', 'popularity', 'tempo']

### Linear Regression

In [20]:
# # Drop the target variable from the features
# X = df.drop(columns=['popularity', 'artists', 'id', 'name', 'release_date'])

# # Target variable
# y = df['popularity']

# # Split the data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define preprocessing steps
# numeric_features = X.select_dtypes(include=['float64', 'int64']).columns

# numeric_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, numeric_features)
#     ])

# # Append classifier to preprocessing pipeline
# clf = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('classifier', LinearRegression())])

# # Fit the model
# clf.fit(X_train, y_train)

# # Predict on the test set
# y_pred = clf.predict(X_test)

# # Calculate accuracy
# print("MSE", mean_squared_error(y_test, y_pred))

In [21]:
# # Linear Regression Pipeline
# # Drop the target variable from the features
# X = df.drop(columns=['popularity', 'artists', 'id', 'name', 'release_date'])

# # Target variable
# y = df['popularity']

# # Split the data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Initialize a scaler for linear regression
# scaler = StandardScaler()

# # Scale the training and test data
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# # Train the linear regression model
# reg = LinearRegression()
# reg.fit(X_train_scaled, y_train)

# # Predict the target variable for the test set
# y_pred = reg.predict(X_test_scaled)

# # Calculate accuracy (Mean Squared Error in this case)
# print("MSE:", mean_squared_error(y_test, y_pred))

### K-Means Clustering

In [94]:
# Clustering and Elbow Method
# Select numerical columns for clustering
X = df[numerical_features].drop('popularity', axis=1)
cols = X.columns
X_scaled = MinMaxScaler().fit_transform(X)

inertia_score=[]
for k in range(2,201):
  kmeans = KMeans(n_clusters=k, random_state=42)
  kmeans.fit(X_scaled)
  # labels = kmeans.labels_
  # centroids = kmeans.cluster_centers_
  inertia = kmeans.inertia_
  inertia_score.append(inertia)

plt.plot(list(range(2,201)), inertia_score, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.grid(True)
plt.show()

KeyboardInterrupt: 

In [23]:
best_k = 2 + inertia_score.index(min(inertia_score))
best_k

20

### KNN

In [24]:
# Best number of clusters (from your silhouette score optimization)
# best_k = best_params['n_clusters']

# Train KMeans with the optimal number of clusters
kmeans = KMeans(n_clusters=best_k)
df['cluster'] = kmeans.fit_predict(X_scaled)

In [25]:
df.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,duration_s,cluster
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",0.04,1921,0.0366,80.954,831.667,9
1,0.963,1921,0.732,['Dennis Day'],0.819,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,0.05,1921,0.415,60.936,180.533,17
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,0.05,1921,0.0339,110.339,500.062,9
3,0.165,1921,0.967,['Frank Parker'],0.275,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,0.03,1921,0.0354,100.109,210.0,7
4,0.253,1921,0.957,['Phil Regan'],0.418,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,0.02,1921,0.038,101.665,166.693,7


In [92]:
df[df['name'] == "單車"] 

Unnamed: 0,valence,year,acousticness,artists,danceability,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,duration_s,cluster
104898,0.307,2001,0.754,['Eason Chan'],0.635,0.35,0,2AllsVsmrJkEwXPXwDBCQu,0.0,7,0.0743,-10.591,1,單車,0.54,2001-03-28,0.0284,130.067,211.251,11


In [26]:
# Function to recommend K nearest songs
def recommend_songs(song_id, K=5):

    song_index = df[df['id']==song_id].index[0]

    # Get the cluster of the input song
    song_cluster = df.loc[song_index, 'cluster']

    # Filter songs in the same cluster
    cluster_data = df[df['cluster'] == song_cluster]

    # Get features of the cluster
    cluster_features = X_scaled[df['cluster'] == song_cluster]

    # Train KNN on the cluster
    knn = NearestNeighbors(n_neighbors= K + 1, metric='euclidean')
    knn.fit(cluster_features)

    # Find the index of the input song within the cluster
    input_song_features = X_scaled[song_index].reshape(1, -1)
    distances, indices = knn.kneighbors(input_song_features)

    # Remove the input song from the results and get the recommended song indices
    recommended_indices = indices[0][1:]

    # Get the song details for the recommendations
    recommendations = cluster_data.iloc[recommended_indices]
    return recommendations

In [88]:
recommend_songs('5YUyW9opqNsMSEzzecZih1') # Caramelo Remix by Ozuna, Karol G, and Myke Towers

Unnamed: 0,valence,year,acousticness,artists,danceability,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,duration_s,cluster
106572,0.607,2009,0.0458,['Shiny Toy Guns'],0.637,0.79,0,6MQdIVpcXuaa9QpuCd33ei,0.0041,10,0.249,-6.776,0,Major Tom,0.44,2009-01-01,0.0383,164.822,262.333,15
56062,0.637,2014,0.021,['Sam Hunt'],0.615,0.824,0,0YQcktqICwSWzlS4P4jzOz,0.0,10,0.342,-4.903,0,Ex To See,0.57,2014-10-27,0.107,149.986,197.813,15
140470,0.68,2019,0.054,"['HIXTAPE', 'HARDY', 'Morgan Wallen']",0.563,0.806,0,65IHmBagtQrYRcJd8a47zv,8e-06,9,0.204,-5.11,0,He Went To Jared (HARDY feat. Morgan Wallen),0.6,2019-09-13,0.0474,140.075,213.693,15
57369,0.717,2020,0.0215,"['K/DA', '(G)I-DLE', 'Wolftyla', 'Bea Miller',...",0.676,0.857,0,2V4Fx72svQRxrFvNT1eq5f,0.0,10,0.254,-4.368,0,THE BADDEST,0.81,2020-08-27,0.0438,149.981,162.6,15
124884,0.589,2020,0.0145,['Dermot Kennedy'],0.579,0.839,0,0l4AReW2LuX0yStPfgQgSR,3e-05,8,0.201,-5.409,0,Giants,0.81,2020-06-24,0.0703,158.141,177.173,15


## data_by_genres.csv

In [28]:
data_by_genres.head()

Unnamed: 0,mode,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,21st century classical,0.979333,0.162883,160297.7,0.071317,0.606834,0.3616,-31.514333,0.040567,75.3365,0.103783,27.833333,6
1,1,432hz,0.49478,0.299333,1048887.0,0.450678,0.477762,0.131,-16.854,0.076817,120.285667,0.22175,52.5,5
2,1,8-bit,0.762,0.712,115177.0,0.818,0.876,0.126,-9.18,0.047,133.444,0.975,48.0,7
3,1,[],0.651417,0.529093,232880.9,0.419146,0.205309,0.218696,-12.288965,0.107872,112.857352,0.513604,20.859882,7
4,1,a cappella,0.676557,0.538961,190628.5,0.316434,0.003003,0.172254,-12.479387,0.082851,112.110362,0.448249,45.820071,7


In [29]:
data_by_genres.shape

(2973, 14)

In [30]:
data_by_genres['genres'].nunique()

2973

In [31]:
data_by_genres[data_by_genres['genres']!='[]']

Unnamed: 0,mode,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,21st century classical,0.979333,0.162883,1.602977e+05,0.071317,0.606834,0.361600,-31.514333,0.040567,75.336500,0.103783,27.833333,6
1,1,432hz,0.494780,0.299333,1.048887e+06,0.450678,0.477762,0.131000,-16.854000,0.076817,120.285667,0.221750,52.500000,5
2,1,8-bit,0.762000,0.712000,1.151770e+05,0.818000,0.876000,0.126000,-9.180000,0.047000,133.444000,0.975000,48.000000,7
4,1,a cappella,0.676557,0.538961,1.906285e+05,0.316434,0.003003,0.172254,-12.479387,0.082851,112.110362,0.448249,45.820071,7
5,1,abstract,0.459210,0.516167,3.431965e+05,0.442417,0.849667,0.118067,-15.472083,0.046517,127.885750,0.307325,43.500000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2968,1,zolo,0.222625,0.547082,2.580991e+05,0.610240,0.143872,0.204206,-11.295878,0.061088,125.494919,0.596155,33.778943,9
2969,0,zouglou,0.161000,0.863000,2.063200e+05,0.909000,0.000000,0.108000,-5.985000,0.081300,119.038000,0.845000,58.000000,7
2970,1,zouk,0.263261,0.748889,3.060728e+05,0.622444,0.257227,0.089678,-10.289222,0.038778,101.965222,0.824111,46.666667,5
2971,0,zurich indie,0.993000,0.705667,1.984173e+05,0.172667,0.468633,0.179667,-11.453333,0.348667,91.278000,0.739000,0.000000,7


## data_by_artist.csv

In [32]:
data_by_artist.head()

Unnamed: 0,mode,count,acousticness,artists,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,9,0.590111,"""Cats"" 1981 Original London Cast",0.467222,250318.555556,0.394003,0.0114,0.290833,-14.448,0.210389,117.518111,0.3895,38.333333,5
1,1,26,0.862538,"""Cats"" 1983 Broadway Cast",0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,30.576923,5
2,1,7,0.856571,"""Fiddler On The Roof” Motion Picture Chorus",0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.857143,0
3,1,27,0.884926,"""Fiddler On The Roof” Motion Picture Orchestra",0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.851852,0
4,1,7,0.510714,"""Joseph And The Amazing Technicolor Dreamcoat""...",0.467143,270436.142857,0.488286,0.0094,0.195,-10.236714,0.098543,122.835857,0.482286,43.0,5


In [33]:
data_by_artist.shape

(28680, 15)

## data_by_year.csv

In [34]:
data_by_year.head()

Unnamed: 0,mode,year,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,1921,0.886896,0.418597,260537.166667,0.231815,0.344878,0.20571,-17.048667,0.073662,101.531493,0.379327,0.653333,2
1,1,1922,0.938592,0.482042,165469.746479,0.237815,0.434195,0.24072,-19.275282,0.116655,100.884521,0.535549,0.140845,10
2,1,1923,0.957247,0.577341,177942.362162,0.262406,0.371733,0.227462,-14.129211,0.093949,114.01073,0.625492,5.389189,0
3,1,1924,0.9402,0.549894,191046.707627,0.344347,0.581701,0.235219,-14.231343,0.092089,120.689572,0.663725,0.661017,10
4,1,1925,0.962607,0.573863,184986.92446,0.278594,0.418297,0.237668,-14.146414,0.111918,115.521921,0.621929,2.604317,5


In [35]:
data_by_year.shape

(100, 14)

## data_w_genres.csv

In [36]:
data_w_genres.head()

Unnamed: 0,genres,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
0,['show tunes'],"""Cats"" 1981 Original London Cast",0.590111,0.467222,250318.555556,0.394003,0.0114,0.290833,-14.448,0.210389,117.518111,0.3895,38.333333,5,1,9
1,[],"""Cats"" 1983 Broadway Cast",0.862538,0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,30.576923,5,1,26
2,[],"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.857143,0,1,7
3,[],"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.851852,0,1,27
4,[],"""Joseph And The Amazing Technicolor Dreamcoat""...",0.510714,0.467143,270436.142857,0.488286,0.0094,0.195,-10.236714,0.098543,122.835857,0.482286,43.0,5,1,7


In [37]:
data_w_genres.shape

(28680, 16)

In [38]:
data_w_genres[data_w_genres['genres']!='[]']

Unnamed: 0,genres,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
0,['show tunes'],"""Cats"" 1981 Original London Cast",0.590111,0.467222,250318.555556,0.394003,0.011400,0.290833,-14.448000,0.210389,117.518111,0.389500,38.333333,5,1,9
8,"['comedy rock', 'comic', 'parody']","""Weird Al"" Yankovic",0.173145,0.662787,218948.196721,0.695393,0.000050,0.161102,-9.768705,0.084536,133.031180,0.751344,34.229508,9,1,122
9,"['emo rap', 'florida rap', 'sad rap', 'undergr...",$NOT,0.544467,0.789800,137910.466667,0.532933,0.023063,0.180300,-9.149267,0.293687,112.344800,0.480700,67.533333,1,1,15
10,"['dark trap', 'meme rap']",$atori Zoom,0.239000,0.883000,141519.000000,0.625000,0.000000,0.076500,-4.098000,0.245000,126.677000,0.871000,67.000000,6,1,2
12,"['asian american hip hop', 'cali rap', 'west c...",$tupid Young,0.148100,0.854000,190572.000000,0.683000,0.000002,0.188500,-6.997000,0.221000,100.724500,0.625500,57.500000,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28667,['classic cantopop'],陳麗斯,0.562000,0.550000,195013.000000,0.375000,0.000004,0.244000,-12.277000,0.033200,75.703000,0.405000,33.000000,0,1,2
28673,"['c-pop', 'classic mandopop', 'vintage chinese...",鳳飛飛,0.884000,0.358000,259387.000000,0.208000,0.000002,0.150000,-9.524000,0.033900,131.261000,0.278000,43.000000,10,1,2
28676,"['c-pop', 'classic cantopop', 'classic mandopo...",黃品源,0.541000,0.578000,293840.000000,0.334000,0.000006,0.067500,-11.974000,0.026700,135.934000,0.243000,48.000000,9,0,2
28678,"['chinese indie', 'chinese indie rock']",黑豹,0.381000,0.353000,316160.000000,0.686000,0.000000,0.056800,-9.103000,0.039500,200.341000,0.352000,35.000000,11,1,2


In [39]:
data_w_genres['genres'].nunique()

10743

## Spotipy

In [40]:
client_id = "6448587d422647a8887bfba66e70067f" # Replace
client_secret = "62f6ce053995469c88d787b84eaf4172" # Replace
redirect_uri = "http://localhost:8881" # Replace
scope = "user-library-read playlist-modify-public"

# Use SpotifyOAuth for user authorization
auth_manager = SpotifyOAuth(client_id=client_id,
                            client_secret=client_secret,
                            redirect_uri=redirect_uri,
                            scope=scope
                            )

sp = spotipy.Spotify(auth_manager=auth_manager)

# # Initialize client credentials manager
# client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)

# # Pass the credentials manager to the Spotify client
# sp = spotipy.Spotify(auth_manager=client_credentials_manager)

In [41]:
def get_track_id(song_name, artist_name):
    query = f"track:{song_name} artist:{artist_name}"
    results = sp.search(q=query, type="track", limit=1)
    if results['tracks']['items']:
        track_id = results['tracks']['items'][0]['id']
        return track_id
    else:
        print("Track not found.")
        return None

# Example usage
song_name = "Home"
artist_name = "ENHYPEN"

query = f"track:{song_name} artist:{artist_name}"
results = sp.search(q=query, type="track", limit=1)
results

track_id = get_track_id(song_name, artist_name)
print(f"Track ID for '{song_name}' by {artist_name}: {track_id}")

Track ID for 'Home' by ENHYPEN: 7yTn8YwfpEimufBe7uDtO7


In [44]:
# Fetch user's playlists
playlists = sp.current_user_playlists(limit=10)

# Extract tracks from playlists
for playlist in playlists['items']:
    print(f"Playlist: {playlist['name']}, Total Tracks: {playlist['tracks']['total']}")
    playlist_tracks = sp.playlist_tracks(playlist['id'])
    for item in playlist_tracks['items']:
        track = item['track']
        print(f" - Track: {track['name']} by {track['artists'][0]['name']}")

Playlist: 🇯🇵, Total Tracks: 33
 - Track: 空も飛べるはず by SPITZ
 - Track: ロビンソン by SPITZ
 - Track: 名もなき詩 by Mr.Children
 - Track: Tomorrow never knows by Mr.Children
 - Track: 異邦人 by Saki Kubota
 - Track: Akagi blues by Noboru Kirishima
 - Track: 少女A - 2012 Remaster by Akina Nakamori
 - Track: 十戒(1984) by Akina Nakamori
 - Track: 青い珊瑚礁 by Seiko Matsuda
 - Track: DAN DAN 心魅かれてく by ZARD
 - Track: 負けないで by ZARD
 - Track: 揺れる想い by ZARD
 - Track: マイ フレンド by ZARD
 - Track: 夜に駆ける by YOASOBI
 - Track: 群青 by YOASOBI
 - Track: 愛にできることはまだあるかい by RADWIMPS
 - Track: One more time，One more chance by Masayoshi Yamazaki
 - Track: ラブ・ストーリーは突然に by Kazumasa Oda
 - Track: キラキラ by Kazumasa Oda
 - Track: 伝えたいことがあるんだ by Kazumasa Oda
 - Track: サボテンの花 by Kazuo Zaitsu
 - Track: 青春の影 by Kazuo Zaitsu
 - Track: 幸せな結末 by 大滝詠一
 - Track: Plastic Love by Mariya Takeuchi
 - Track: Mayonaka no Door / Stay With Me by Miki Matsubara
 - Track: 愛は勝つ by KAN
 - Track: 今夜月の見える丘に (Alternative Guitar Solo ver.) by B'z
 - Track: ら･ら･ら 

In [93]:
track_ids = ['2AllsVsmrJkEwXPXwDBCQu'] # Add more track IDs
features_list = []

for track_id in track_ids:
    features = sp.audio_features(track_id)[0]
    features_list.append(features)

# Convert to DataFrame
df_features = pd.DataFrame(features_list)
print(df_features.head())

HTTP Error for GET to https://api.spotify.com/v1/audio-features/?ids=2AllsVsmrJkEwXPXwDBCQu with Params: {} returned 403 due to None


SpotifyException: http status: 403, code:-1 - https://api.spotify.com/v1/audio-features/?ids=2AllsVsmrJkEwXPXwDBCQu:
 None, reason: None