In [1]:
# Importing necessary libraries and settings
import time
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format ='retina'
import random
from functools import reduce
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy import oauth2

In [2]:
# Spotify username and developer credentials.
# They are read from environment variables so that real secrets never have to be
# typed into (and committed with) the notebook. Set SPOTIPY_CLIENT_ID,
# SPOTIPY_CLIENT_SECRET and SPOTIPY_CLIENT_USERNAME (optionally
# SPOTIPY_REDIRECT_URI) before starting Jupyter. The placeholder defaults below
# are only used when a variable is not set.
cid = os.environ.get('SPOTIPY_CLIENT_ID', 'XXXXXX')
secret = os.environ.get('SPOTIPY_CLIENT_SECRET', 'XXXXXX')
redirect_uri = os.environ.get('SPOTIPY_REDIRECT_URI', 'http://localhost:7777/callback')
username = os.environ.get('SPOTIPY_CLIENT_USERNAME', 'XXXXXX')

In [3]:
# Once the authorisation is complete, we only need `sp` to call the APIs
scope = 'user-top-read playlist-modify-private playlist-modify-public'
token = util.prompt_for_user_token(username, scope, client_id=cid, client_secret=secret, redirect_uri=redirect_uri)

# Stop here when no token was obtained: otherwise `sp` would never be defined and
# the first later cell that talks to Spotify would fail with an unrelated-looking
# NameError instead of pointing at the authorisation step.
if not token:
    raise RuntimeError("Can't get token for " + str(username))

sp = spotipy.Spotify(auth=token)

In [4]:
# Getting features for each song
def fetch_audio_features(sp, df):
    """Fetch audio features for every track in ``df``.

    Parameters
    ----------
    sp
        Client object exposing ``audio_features(track_ids)``. It is called with
        lists of at most 50 track ids and must return one entry per id.
    df : pandas.DataFrame
        Must contain the columns 'track_id' and 'track_name'. Its index is
        ignored.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (same order), indexed by 'track_name', with
        'track_id' followed by the twelve audio-feature columns. Tracks for
        which the client returned ``None`` keep their row with missing values.
    """
    feature_columns = ['danceability', 'acousticness', 'energy', 'tempo',
                       'instrumentalness', 'loudness', 'liveness', 'duration_ms', 'key',
                       'valence', 'speechiness', 'mode']
    batch_size = 50

    # Use a fresh positional index: the column-wise concat below aligns on index
    # labels, so a filtered or deduplicated `df` would otherwise be paired with
    # the wrong feature rows (or with NaN).
    playlist = df[['track_id', 'track_name']].reset_index(drop=True)
    track_ids = playlist['track_id'].tolist()

    # Make the API requests, one batch of ids at a time
    audio_features = []
    for start in range(0, len(track_ids), batch_size):
        audio_features += sp.audio_features(track_ids[start:start + batch_size])

    # One row of feature values per track; a None entry (no features returned for
    # that id) becomes a row of missing values so the rows stay aligned.
    features_list = [
        [features[column] if features is not None else None for column in feature_columns]
        for features in audio_features
    ]
    df_audio_features = pd.DataFrame(features_list, columns=feature_columns)

    # Create the final df, using 'track_name' as index for future reference
    df_playlist_audio_features = pd.concat([playlist, df_audio_features], axis=1)
    return df_playlist_audio_features.set_index('track_name', drop=True)

### Getting the songs from all of Spotify's playlists

The following cells collect the songs from all of the playlists of the official Spotify account. In this repo, I've curated a dataframe of approximately 10,000 songs. If you would like to add more songs, uncomment and modify the cells below. I've commented them out because they take a long time to run and are unnecessary if you'd like to use the dataframe I've already built.

In [5]:
# # Getting playlist IDs from each of Spotify's playlists
# playlists = sp.user_playlists('spotify')
# spotify_playlist_ids = []
# while playlists:
#     for i, playlist in enumerate(playlists['items']):
#         spotify_playlist_ids.append(playlist['uri'][-22:])
#     if playlists['next']:
#         playlists = sp.next(playlists)
#     else:
#         playlists = None
# spotify_playlist_ids[:20]

In [6]:
# len(spotify_playlist_ids)

### Getting tracks from Spotify playlists

In [7]:
# Creating a function to get the first 50 track IDs from a playlist
def getTrackIDs(playlist_id):
    """Append the ids of the first 50 tracks of a 'spotify' playlist to ``ids``.

    Relies on two notebook-level names: the authorised client ``sp`` and the
    list ``ids``, which must exist before the call and is extended in place.
    Returns None.
    """
    playlist = sp.user_playlist('spotify', playlist_id)
    for item in playlist['tracks']['items'][:50]:
        track = item['track']
        # Skip entries that carry no track object or no id: indexing None would
        # abort the whole playlist, and a None id is useless for later lookups.
        if track is None or track['id'] is None:
            continue
        ids.append(track['id'])
    return

In [8]:
# Creating a function to get the metadata and audio features of a track from its track id
def getTrackFeatures(track_id):
    """Return metadata and audio features of one track as a flat list.

    Uses the notebook-level client ``sp`` for both lookups.

    Returns
    -------
    list
        [track_id, name, album, artist, release_date, length, popularity,
        danceability, acousticness, energy, instrumentalness, liveness,
        loudness, speechiness, tempo, time_signature], where ``artist`` is the
        first artist listed on the track's album and ``length`` is the
        duration in milliseconds.

    Raises
    ------
    ValueError
        If no audio features were returned for ``track_id``.
    """
    meta = sp.track(track_id)
    features = sp.audio_features(track_id)

    # Fail with a readable message instead of "'NoneType' object is not
    # subscriptable" when the lookup yields nothing usable for this track.
    if not features or features[0] is None:
        raise ValueError('No audio features available for track {!r}'.format(track_id))

    audio = features[0]
    album = meta['album']

    track = [
        # meta
        track_id,
        meta['name'],
        album['name'],
        album['artists'][0]['name'],
        album['release_date'],
        meta['duration_ms'],
        meta['popularity'],
        # features
        audio['danceability'],
        audio['acousticness'],
        audio['energy'],
        audio['instrumentalness'],
        audio['liveness'],
        audio['loudness'],
        audio['speechiness'],
        audio['tempo'],
        audio['time_signature'],
    ]
    return track

The cell below takes about five minutes to run.

In [9]:
# %%time
# # Gathering track ids
# ids = []
# for x in spotify_playlist_ids[:200]:
#     getTrackIDs(x)
# ids[:5]

The cell below takes about 30 minutes to run.

In [10]:
# %%time
# # loop over track ids to get audio features for each track
# tracks = []
# for i in range(len(ids)):
#     try:  
#         track = getTrackFeatures(ids[i])
#         tracks.append(track)
#     except:
#         pass

# # create dataset
# df = pd.DataFrame(tracks, columns = ['track_id', 'name', 'album', 'artist', 'release_date', 'length', 'popularity', 'danceability', 'acousticness', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'time_signature'])
# df.head()

In [11]:
# df.to_csv('playlist_songs.csv',index=False)

In [12]:
# Load the previously collected playlist songs from the data folder
df = pd.read_csv('data/playlist_songs.csv')
df.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,7MAibcTli4IisCtbHKrGMh,Leave The Door Open,Leave The Door Open,Bruno Mars,2021-03-05,242096,90,0.586,0.182,0.616,0.0,0.0927,-7.964,0.0324,148.088,4
1,5QO79kh1waicV47BqGRL3g,Save Your Tears,After Hours,The Weeknd,2020-03-20,215626,97,0.68,0.0212,0.826,1.2e-05,0.543,-5.487,0.0309,118.051,4
2,1diS6nkxMQc3wwC4G1j0bh,We're Good,Future Nostalgia (The Moonlight Edition),Dua Lipa,2021-02-11,165506,88,0.722,0.0319,0.588,0.0,0.183,-5.932,0.0544,134.01,4
3,4u4NyuceXP7Uzh7XFJKCr1,Hold On,Hold On,Justin Bieber,2021-03-05,170813,89,0.658,0.0106,0.634,0.0,0.132,-5.797,0.0413,139.98,4
4,3Ofmpyhv5UAQ70mENzB277,Astronaut In The Ocean,Astronaut In The Ocean,Masked Wolf,2021-01-06,132780,94,0.778,0.175,0.695,0.0,0.15,-6.865,0.0913,149.996,4


In [13]:
# Dropping columns that could lead to data leakage.
# errors='ignore' keeps this cell safe to re-run once the columns are already gone.
df = df.drop(columns=['name', 'album', 'artist', 'release_date'], errors='ignore')
df.head()

Unnamed: 0,track_id,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,7MAibcTli4IisCtbHKrGMh,242096,90,0.586,0.182,0.616,0.0,0.0927,-7.964,0.0324,148.088,4
1,5QO79kh1waicV47BqGRL3g,215626,97,0.68,0.0212,0.826,1.2e-05,0.543,-5.487,0.0309,118.051,4
2,1diS6nkxMQc3wwC4G1j0bh,165506,88,0.722,0.0319,0.588,0.0,0.183,-5.932,0.0544,134.01,4
3,4u4NyuceXP7Uzh7XFJKCr1,170813,89,0.658,0.0106,0.634,0.0,0.132,-5.797,0.0413,139.98,4
4,3Ofmpyhv5UAQ70mENzB277,132780,94,0.778,0.175,0.695,0.0,0.15,-6.865,0.0913,149.996,4


In [14]:
# Keep a single row per track_id, then count how often each id is left
df = df.loc[~df.duplicated(subset=['track_id'])]
df['track_id'].value_counts()

4CRYAPb5vbeP91rAD6ppRM    1
0tMfqwt9V5IZhT8c1QQhZX    1
5ugGAdRJMbbVp9PyLpBSZs    1
1BhtPxwzsPMFXhlfCpkw9B    1
5IjIbGO7lih9CVDBFLCtTT    1
                         ..
3Cgqpo0ASrbFeX7rNgiuZW    1
7EcKCPFNyB3k806vHg7jGE    1
3vzjnDcT3KABBJznjIfJWe    1
3HnDP4FrCcTgbhRdHs7Rvd    1
7ytES33eLYS9WaZLKqWfYM    1
Name: track_id, Length: 8883, dtype: int64

## Getting user's favorite tracks

In [15]:
# Getting the user's top 50 tracks (50 is the largest page the endpoint serves per request)
results = sp.current_user_top_tracks(limit=50, offset=0, time_range='short_term')

In [16]:
# Turn every item of the API response into one record
favourite_rows = [
    {
        "track_name": item['name'],
        "album": item["album"]["name"],
        "track_id": item['id'],
        "artist": item["artists"][0]["name"],
        "duration": item["duration_ms"],
        "popularity": item["popularity"],
    }
    for item in results['items']
]

# Create the final df (explicit columns keep the layout even for an empty response)
df_favourite = pd.DataFrame(
    favourite_rows,
    columns=["track_name", "album", "track_id", "artist", "duration", "popularity"],
)

df_favourite.head()

Unnamed: 0,track_name,album,track_id,artist,duration,popularity
0,Tarantino,Almost There,1CzjJzPOCLkZr2oqiOrXjc,LUCKI,114773,39
1,Prada Tune,Almost There,23slkWAmlcxi3XcSwC9RLZ,LUCKI,122813,46
2,Off The Map,After Me,6GNG0YQixWuLE0M5FtrRxY,SoFaygo,141582,62
3,Faith,Faith,0TqNfrOY2IrpFRI2zxsMq4,LUCKI,107467,37
4,Saucin' - Remix,Freewave,1djb4ACCvrs7MPnKwA9w9I,LUCKI,208274,38


In [17]:
%%time
# Getting track features for each song in favorite song dataframe
fav_tracks = []
for track in df_favourite['track_id']:
    try:  
        track = getTrackFeatures(track)
        fav_tracks.append(track)
    except:
        pass

CPU times: user 385 ms, sys: 59.6 ms, total: 445 ms
Wall time: 4.69 s


In [18]:
# Create the favorite-track dataset with audio features.
# The column order mirrors the list returned by getTrackFeatures.
df_fav = pd.DataFrame(
    fav_tracks,
    columns=[
        'track_id', 'name', 'album', 'artist', 'release_date', 'length', 'popularity',
        'danceability', 'acousticness', 'energy', 'instrumentalness', 'liveness',
        'loudness', 'speechiness', 'tempo', 'time_signature',
    ],
)
df_fav.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,1CzjJzPOCLkZr2oqiOrXjc,Tarantino,Almost There,LUCKI,2020-05-29,114773,39,0.79,0.0495,0.547,0.0,0.114,-9.974,0.108,135.037,4
1,23slkWAmlcxi3XcSwC9RLZ,Prada Tune,Almost There,LUCKI,2020-05-29,122813,46,0.706,0.0263,0.268,0.0,0.239,-13.22,0.287,130.048,4
2,6GNG0YQixWuLE0M5FtrRxY,Off The Map,After Me,SoFaygo,2020-12-17,141582,62,0.472,0.0757,0.847,0.0,0.3,-6.347,0.0419,150.164,4
3,0TqNfrOY2IrpFRI2zxsMq4,Faith,Faith,LUCKI,2020-04-17,107467,37,0.878,0.123,0.584,0.0,0.105,-10.014,0.419,125.069,4
4,1djb4ACCvrs7MPnKwA9w9I,Saucin' - Remix,Freewave,LUCKI,2015-10-15,208274,38,0.916,0.404,0.382,0.0,0.379,-12.46,0.368,129.985,4


In [19]:
# Dropping columns that could lead to data leakage.
# errors='ignore' keeps this cell safe to re-run once the columns are already gone.
df_fav = df_fav.drop(columns=['name', 'album', 'artist', 'release_date'], errors='ignore')
df_fav.head()

Unnamed: 0,track_id,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,1CzjJzPOCLkZr2oqiOrXjc,114773,39,0.79,0.0495,0.547,0.0,0.114,-9.974,0.108,135.037,4
1,23slkWAmlcxi3XcSwC9RLZ,122813,46,0.706,0.0263,0.268,0.0,0.239,-13.22,0.287,130.048,4
2,6GNG0YQixWuLE0M5FtrRxY,141582,62,0.472,0.0757,0.847,0.0,0.3,-6.347,0.0419,150.164,4
3,0TqNfrOY2IrpFRI2zxsMq4,107467,37,0.878,0.123,0.584,0.0,0.105,-10.014,0.419,125.069,4
4,1djb4ACCvrs7MPnKwA9w9I,208274,38,0.916,0.404,0.382,0.0,0.379,-12.46,0.368,129.985,4


In [20]:
# Checking for duplicates in df_fav: a count above 1 would mean a repeated track_id
df_fav['track_id'].value_counts()

41KwHpVfT6lmaBG5kegLoe    1
1ePBIQqAIkQ6hTCSCKB1wB    1
2bjwRfXMk4uRgOD9IBYl9h    1
7hVhyJTFsimfq6GlGEPi0G    1
4aEn6zK5li4IWUmqGSrEdp    1
28JCjRpPSZqlmvIDF3SSjo    1
5qUDYpihgX7UkBs15219WT    1
7ibRB2S2WOfPKSvYkhcYtj    1
1pjMiOvZvwIQS1yyMrQZ0M    1
0cdv2x7SWb6iRvKMTWsjHA    1
1djb4ACCvrs7MPnKwA9w9I    1
1Bg2CNZw6S4e9cGWPmi0uI    1
0HMwZSGo2dSdpAEybsxYSG    1
6GNG0YQixWuLE0M5FtrRxY    1
7nPjDGg89BVpS2zuQDclGV    1
4DkZDzcHicZBXCEdA19oWs    1
0CfHAgwAGHXS0bG8MvI5TV    1
0zXnqruuTKhV7dTmbaO52L    1
1w4XLeZi73c56N1C9QcIoB    1
6mvEk4k2zDgm8GbVJlAnyg    1
6P8rBeOxn0ervew8cr0jcD    1
2AlYncTpVHKwHb55F9lF6O    1
53d1tlSLNunnKIQAHpHRSO    1
1CzjJzPOCLkZr2oqiOrXjc    1
2Ge26JkRTyJ96T8KbsCzMI    1
176LvsOHbOXcgJgBBZ8wgs    1
1360nnYaxJzEi6TlVoJgj9    1
0w6webWdhjRKdqJ3DeGgM1    1
0TqNfrOY2IrpFRI2zxsMq4    1
1uJGWwJ0NZYvGD4kbKlOTl    1
6xczQgrsVl3jFO3eEoNMK0    1
2MShy1GSSgbmGUxADNIao5    1
79XrkTOfV1AqySNjVlygpW    1
3V7o9lJBwqrJiC1FiyQRz6    1
26UmIgkm68J3SVvXhqt5St    1
2lLG56qpLP3UbcLuzMvk

In [21]:
# Label column for the classifier: 1 marks the user's favorites, 0 the playlist songs
df_fav = df_fav.assign(favorite=1)
df = df.assign(favorite=0)

In [22]:
# Checking if both datasets have the same columns.
# The comparison is element-wise, position by position, so the result is one
# boolean per column rather than a single True/False.
df.columns == df_fav.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

## Preparing dataset for model creation

In [23]:
# Shapes of the playlist songs and of the favorites, in that order
for frame in (df, df_fav):
    print(frame.shape)

(8883, 13)
(50, 13)


In [24]:
# Stack Spotify's playlist songs and the favorite songs row-wise (playlist rows first)
combined = pd.concat((df, df_fav), axis=0)
combined.shape

(8933, 13)

In [25]:
# Class balance: number of rows per value of the favorite label
combined['favorite'].value_counts()

0    8883
1      50
Name: favorite, dtype: int64

The ratio of favorite to non-favorite songs is heavily imbalanced (50 favorites vs. 8,883 other songs), so I will need to address this when building the model.

In [26]:
# Favorite songs are the rows of the combined frame labelled 1
df_fav = combined[combined['favorite'].eq(1)]
df_fav.head()

Unnamed: 0,track_id,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
0,1CzjJzPOCLkZr2oqiOrXjc,114773,39,0.79,0.0495,0.547,0.0,0.114,-9.974,0.108,135.037,4,1
1,23slkWAmlcxi3XcSwC9RLZ,122813,46,0.706,0.0263,0.268,0.0,0.239,-13.22,0.287,130.048,4,1
2,6GNG0YQixWuLE0M5FtrRxY,141582,62,0.472,0.0757,0.847,0.0,0.3,-6.347,0.0419,150.164,4,1
3,0TqNfrOY2IrpFRI2zxsMq4,107467,37,0.878,0.123,0.584,0.0,0.105,-10.014,0.419,125.069,4,1
4,1djb4ACCvrs7MPnKwA9w9I,208274,38,0.916,0.404,0.382,0.0,0.379,-12.46,0.368,129.985,4,1


In [27]:
# Removing favorite songs from playlist songs.
# Filtering on the label alone only hands back the playlist rows unchanged, so a
# track that is both in a Spotify playlist and among the favorites would stay in
# the data labelled 0 as well as 1. Also drop every playlist row whose track_id
# belongs to a favorite.
favorite_track_ids = combined.loc[combined['favorite'] == 1, 'track_id']
df = combined.loc[(combined['favorite'] != 1) & ~combined['track_id'].isin(favorite_track_ids)]
df.shape

(8883, 13)

In [28]:
# (rows, columns) of the favorite songs dataframe
df_fav.shape

(50, 13)

In [29]:
# Persist both dataframes for the model-creation notebook (the row index is not written)
for frame, csv_path in ((df, 'encoded_playlist_songs.csv'), (df_fav, 'favorite_songs.csv')):
    frame.to_csv(csv_path, index=False)