## Exploring Spotify music taste profiles using Machine Learning

In [4]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.preprocessing import StandardScaler
from math import pi
# Show floats with three decimal places wherever pandas renders them
pd.set_option('display.float_format', '{:.3f}'.format)


In [7]:
# Load song data: one CSV per listener's favourite songs plus the Top USA tracks.
# All paths are relative to the directory the notebook is run from.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for
# top_USA_final.csv -- confirm the CSV files sit next to the notebook before running.

#top_annual_df = pd.read_csv("./top_tracks_final.csv")
top_tracks_USA = pd.read_csv("./top_USA_final.csv")
songs_j_loves = pd.read_csv("./songs_j_loves_final.csv")
songs_b_loves = pd.read_csv("./songs_b_loves_final.csv")
songs_k_loves = pd.read_csv("./spotify_kaki_favs_merged.csv")
songs_a_loves = pd.read_csv("./spotify_ali_favs_merged.csv")


FileNotFoundError: [Errno 2] No such file or directory: '/Desktop/D_Sci/NYDSA_Bootcamp/capstone_spotify/top_USA_final.csv'

In [None]:
# Inspect the column names of J's favourite-songs frame
songs_j_loves.columns

In [None]:
# Inspect the column names of B's favourite-songs frame
songs_b_loves.columns

In [None]:
# Inspect the column names of K's favourite-songs frame
songs_k_loves.columns

In [None]:
# Inspect the column names of A's favourite-songs frame
songs_a_loves.columns

In [None]:
# Rename B's track columns so that they match the names used by the other dataframes.
# A new frame is assigned (no inplace=True) so the cell reads the same on every run.
songs_b_loves = songs_b_loves.rename(columns={"track_name": "name", "track_id": "id"})

# Drop the leftover index column.
# errors="ignore" keeps this cell re-runnable: on a second run "index_col" is already
# gone, and without it drop() would raise a KeyError.
songs_k_loves = songs_k_loves.drop(columns=["index_col"], errors="ignore")
songs_b_loves = songs_b_loves.drop(columns=["index_col"], errors="ignore")
songs_a_loves = songs_a_loves.drop(columns=["index_col"], errors="ignore")

In [None]:
# Put the columns of every favourite-songs frame in the same order.
# NOTE: reindex keeps only the columns listed here; a listed column that a frame
# does not have is created and filled with NaN.
columns= ['id', 'name', 'artist', 'popularity', 'duration', 'danceability', 
          'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
          'instrumentalness', 'liveness', 'valence', 'tempo', 'release_date', 'release_year','album']

songs_k_loves = songs_k_loves.reindex(columns, axis = "columns")
songs_b_loves = songs_b_loves.reindex(columns, axis = "columns")
songs_j_loves = songs_j_loves.reindex(columns, axis = "columns")
# songs_a_loves is part of the column comparison made further down, so it has to be
# reordered together with the other three frames.
songs_a_loves = songs_a_loves.reindex(columns, axis = "columns")



In [None]:
# Keep each dataframe's column names in its own plain list (four lists, one per
# listener) so that the lists can be compared with ==
column_names_b = list(songs_b_loves.columns)
column_names_j = list(songs_j_loves.columns)
column_names_k = list(songs_k_loves.columns)
column_names_a = list(songs_a_loves.columns)


In [None]:
# Show B's column names (after renaming and reordering)
print(column_names_b)

In [None]:
# Show K's column names for a side-by-side check against B's
print(column_names_k)

In [None]:
# Confirm that the column names (and their order) are identical in all four dataframes
all(names == column_names_b for names in (column_names_k, column_names_j, column_names_a))

In [None]:
# Visualize the distributions of J's liked tracks on a 3x3 grid, one feature per panel.
#
# Each panel is described by:
#   (column, x-axis label, annotation x, annotation y, annotation font size,
#    format of the mean, x-axis limits)
# The annotation shows the column mean. It is computed from the data instead of being
# typed in by hand, so it cannot go stale when the CSV changes. A panel whose
# annotation x is None gets no annotation; x-axis limits of None leave the axis alone.
j_panels = [
    ("tempo",        'Tempo (BPM)',             150,  .010, 18, "{:.2f}", None),
    ("popularity",   'Popularity',              70,   .020, 18, "{:.2f}", None),
    ("energy",       'Energy',                  0.4,  .2,   18, "{:.2f}", None),
    ("loudness",     'Loudness  ',              -30,  0.10, 22, "{:.2f}", None),
    ("danceability", 'Danceability ',           0.5,  0.2,  22, "{:.2f}", None),
    ("valence",      'Valence (positive mood)', 0.4,  0.2,  22, "{:.2f}", None),
    ("acousticness", 'Acousticness',            0.7,  1.0,  22, "{:.2f}", None),
    ("release_year", 'Release Year',            1960, 0.02, 18, "{:.0f}", (1950, 2025)),
    ("duration",     'Song Duration (sec)',     None, None, 18, "{:.2f}", None),
]

plt.figure(1, figsize=(20, 10),)
# Adjust the subplot layout parameters
plt.subplots_adjust(hspace=0.5, wspace=1.0)

for position, (column, xlabel, text_x, text_y, text_size, mean_format, x_limits) in enumerate(j_panels, start=1):
    plt.subplot(3, 3, position)
    sns.distplot(songs_j_loves[column]);
    if text_x is not None:
        # mathtext label of the form  mu = <mean>
        mean_label = r'$\mu=\ ' + mean_format.format(songs_j_loves[column].mean()) + '$'
        plt.text(text_x, text_y, mean_label, fontsize=text_size)
    if x_limits is not None:
        plt.xlim(*x_limits)
    plt.xlabel(xlabel, fontsize=18)
    plt.grid(True)

plt.tight_layout(pad=1.0, w_pad=5.0, h_pad=1.0)

#### Comparing the feature distributions of three individuals' favorite songs

In [None]:
# Overlay the feature distributions of J's, B's and K's liked tracks on a 3x3 grid.

# (column, x-axis label) for each panel, in grid order
jbk_panels = [
    ("tempo",        'Tempo (BPM)'),
    ("popularity",   'Popularity'),
    ("energy",       'Energy'),
    ("loudness",     'Loudness  '),
    ("danceability", 'Danceability '),
    ("valence",      'Valence (positive mood)'),
    ("acousticness", 'Acousticness'),
    ("liveness",     'Liveness'),
    ("duration",     'Song Duration (sec)'),
]

# (legend label, dataframe) for each listener, in drawing order
jbk_listeners = [("J", songs_j_loves), ("B", songs_b_loves), ("K", songs_k_loves)]

plt.figure(1, figsize=(20, 10),)
# Adjust the subplot layout parameters
plt.subplots_adjust(hspace=0.5, wspace=1.0)

for position, (column, xlabel) in enumerate(jbk_panels, start=1):
    plt.subplot(3, 3, position)
    for listener, liked_songs in jbk_listeners:
        # Label each distribution when it is drawn. Passing bare labels to
        # plt.legend() matches them to whichever artists come first on the axes,
        # which is not guaranteed to be one artist per listener.
        sns.distplot(liked_songs[column], label=listener);
    plt.xlabel(xlabel, fontsize=18);
    plt.legend();
    plt.grid(True)

# Finalise the layout BEFORE saving, otherwise the saved image does not get it
plt.tight_layout(pad=1.0, w_pad=5.0, h_pad=1.0)
plt.savefig("./eda_chart_BvJvK", dpi = 200)

#### Comparing my Top Tracks with the Top USA Tracks

In [None]:
# Overlay the feature distributions of J's liked tracks and the Top USA tracks (3x3 grid).

# (column, x-axis label) for each panel, in grid order
usa_panels = [
    ("tempo",        'Tempo (BPM)'),
    ("popularity",   'Popularity'),
    ("energy",       'Energy'),
    ("loudness",     'Loudness  '),
    ("danceability", 'Danceability '),
    ("valence",      'Valence (positive mood)'),
    ("acousticness", 'Acousticness'),
    ("liveness",     'Liveness'),
    ("duration",     'Song Duration (sec)'),
]

# (legend label, dataframe) for the two track collections, in drawing order
usa_series = [("J", songs_j_loves), ("Top USA", top_tracks_USA)]

plt.figure(1, figsize=(20, 10),)
# Adjust the subplot layout parameters
plt.subplots_adjust(hspace=0.5, wspace=1.0)

for position, (column, xlabel) in enumerate(usa_panels, start=1):
    plt.subplot(3, 3, position)
    for series_label, tracks in usa_series:
        sns.distplot(tracks[column], label=series_label);
    plt.xlabel(xlabel, fontsize=18)
    # Without a legend the two overlaid distributions cannot be told apart
    plt.legend();
    plt.grid(True)

plt.tight_layout(pad=1.0, w_pad=5.0, h_pad=1.0)

### Radar chart to compare features

#####  Create a radar chart showing music preferences
Steps:
1. Create a df that contains the features we want to visualize
2. Scale the feature values to facilitate easier visual comparison
3. Create a Series object that contains the mean values of each feature (this is what we will visualize)
4. Create a parameter object that includes the names of the features we want to visualize
5. Create a range object that captures the min and max values for each feature (multiply by 25% for buffer)

In [None]:
# Build one frame per listener holding only the columns we want to visualize.
# The same set of columns is removed from each frame, so it is listed once.
non_feature_columns = ['release_year', 'duration', 'mode', 'key',
                       'release_date', 'id', 'album', 'artist', 'name']

df_j = songs_j_loves.drop(columns=non_feature_columns)
df_b = songs_b_loves.drop(columns=non_feature_columns)
df_k = songs_k_loves.drop(columns=non_feature_columns)

# Show the remaining columns of each frame (J, then B, then K)
for features_df in (df_j, df_b, df_k):
    print(features_df.columns)


In [None]:
# For interpretability, scale the feature values using MinMax Scaler (min = 0, max = 1).
# Each listener's frame is scaled on its own: the scaler is re-fitted on every frame,
# so 0 and 1 are that listener's own minimum and maximum for each feature.

# musical attributes that get rescaled
scaled_features = ['popularity', 'danceability', 'energy', 'loudness', 'speechiness',
                   'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

# Work on copies. A plain assignment (df_j_scaled = df_j) only creates a second name
# for the same object, so scaling would silently overwrite the unscaled frames too.
df_j_scaled = df_j.copy()
df_b_scaled = df_b.copy()
df_k_scaled = df_k.copy()

scaler = MinMaxScaler()

for scaled_df in (df_j_scaled, df_b_scaled, df_k_scaled):
    scaled_df[scaled_features] = scaler.fit_transform(scaled_df[scaled_features])



In [None]:
# Check that the scaling worked: after MinMax scaling the summary for J should show
# min = 0 and max = 1 for every scaled column
df_j_scaled.describe()

In [None]:
# Summary statistics of B's scaled features
df_b_scaled.describe()

In [None]:
# Summary statistics of K's scaled features
df_k_scaled.describe()

In [None]:
# Column-wise mean of every scaled song attribute, one Series per listener
means_j, means_b, means_k = (
    scaled_df.mean() for scaled_df in (df_j_scaled, df_b_scaled, df_k_scaled)
)

# Print the three Series, separated by blank lines
print(means_j, "\n")
print(means_b, "\n")
print(means_k)


In [None]:
# Names of the variables to visualize, taken from the index of J's mean values
params = means_j.index.tolist()
params

In [None]:
# --------- Radar (spider) chart of the mean scaled feature values for J, B and K

# (legend label, line label, per-feature means, colour) for each listener.
# A single colour per listener drives the outline, the fill and the legend patch,
# so the three always agree with each other.
radar_series = [
    ("J music", "J", means_j, "blue"),
    ("B music", "B", means_b, "red"),
    ("K music", "K", means_k, "green"),
]

# Set the figure size BEFORE the figure is created: rcParams only apply to figures
# created afterwards, so setting it at the end of the cell never resized this plot.
plt.rcParams["figure.figsize"] = (10,15)

# number of variables we are visualizing
N = len(params)

# Angle of each axis: the full circle divided evenly between the variables.
# The first angle is repeated at the end so that each outline closes on itself.
angles = [n / float(N) * 2 * pi for n in range(N)]
angles += angles[:1]

# Initialise the spider plot
ax = plt.subplot(111, polar=True)
ax.set_facecolor("white")

# Draw one axis per variable + add labels
plt.xticks(angles[:-1], params, color='black', size=10)

# Draw the radial tick labels.
# NOTE: the radial axis stops at 0.75, so a mean above 0.75 falls outside the plotted range.
ax.set_rlabel_position(90)
plt.yticks([0.25,0.50,0.75], ["0.25","0.5","0.75"], color="dimgray", size=10)
plt.ylim(0,0.75)

# ------- Add one outline + filled area per listener
legend_patches = []
for legend_label, line_label, feature_means, colour in radar_series:
    # Read the means in `params` order so every value lands on its own axis,
    # then repeat the first value to close the circular graph.
    values = list(feature_means[params].values)
    values += values[:1]
    # Plot data
    ax.plot(angles, values, color=colour, linewidth=1.5, linestyle='dashed', label=line_label)
    # Fill area
    ax.fill(angles, values, colour, alpha=0.1)
    legend_patches.append(mpatches.Patch(color=colour, label=legend_label))

# Add legend
plt.legend(loc = "upper right",
           handles=legend_patches, bbox_to_anchor=(0.05, 0.05))

# buffer for axis labels
ax.tick_params(pad=24)

# finalise the layout, then save the radar plot
plt.tight_layout()
plt.savefig("./radar_chart_BvJvK", dpi = 200)
