In [1]:
import time
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format ='retina'
import random
from functools import reduce
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy import oauth2

In [2]:
# Spotify credentials are read from the environment so that real secrets are never saved
# inside the notebook. SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET / SPOTIPY_REDIRECT_URI are
# the variable names spotipy itself recognises; SPOTIFY_USERNAME is specific to this notebook.
# When a variable is unset the original placeholder is used, so either export the variables
# or replace the placeholders with the values obtained from the Spotify developer dashboard.
cid = os.environ.get('SPOTIPY_CLIENT_ID', 'XXXXX')
secret = os.environ.get('SPOTIPY_CLIENT_SECRET', 'XXXXXX')
redirect_uri = os.environ.get('SPOTIPY_REDIRECT_URI', 'http://localhost:7777/callback')
username = os.environ.get('SPOTIFY_USERNAME', 'XXXXXX')

In [3]:
# Request a user token for the scopes below; on success `sp` is the authenticated
# client that every later cell uses to call the API
scope = 'user-top-read playlist-modify-private playlist-modify-public'
token = util.prompt_for_user_token(
    username,
    scope,
    client_id=cid,
    client_secret=secret,
    redirect_uri=redirect_uri,
)

if not token:
    # No client is created in this case, so `sp` stays undefined
    print("Can't get token for", username)
else:
    sp = spotipy.Spotify(auth=token)

In [4]:
# Getting features for each song
def fetch_audio_features(sp, df, batch_size=50):
    """Fetch audio features for every track in ``df``.

    Parameters
    ----------
    sp : object
        Client exposing ``audio_features(track_ids)``, which returns one entry
        (a dict of features, or ``None``) per requested track ID.
    df : pandas.DataFrame
        Must contain the columns 'track_id' and 'track_name'. Its index is ignored.
    batch_size : int, default 50
        Number of track IDs sent per ``sp.audio_features`` call.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'track_name', with 'track_id' followed by the twelve audio-feature
        columns. Tracks for which no features were returned keep their row, with
        missing values in the feature columns.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    feature_names = ['danceability', 'acousticness', 'energy', 'tempo',
                     'instrumentalness', 'loudness', 'liveness', 'duration_ms', 'key',
                     'valence', 'speechiness', 'mode']

    # Use a fresh positional index: the column-wise concat below aligns on index labels,
    # so a filtered or re-ordered input frame would otherwise be paired with the wrong rows.
    playlist = df[['track_id', 'track_name']].reset_index(drop=True)

    # Make the API requests, one batch of IDs at a time
    audio_features = []
    for start in range(0, len(playlist), batch_size):
        batch_ids = playlist['track_id'].iloc[start:start + batch_size].tolist()
        batch_features = sp.audio_features(batch_ids)
        # Keep exactly one slot per requested ID even if the whole batch came back empty
        audio_features.extend(batch_features if batch_features else [None] * len(batch_ids))

    # One row per track; a track without features becomes a row of missing values
    features_list = [
        [features[name] for name in feature_names] if features else [None] * len(feature_names)
        for features in audio_features
    ]
    df_audio_features = pd.DataFrame(features_list, columns=feature_names)

    # Create the final df, using 'track_name' as the index for future reference
    df_playlist_audio_features = pd.concat([playlist, df_audio_features], axis=1)
    return df_playlist_audio_features.set_index('track_name')

### Getting the songs from all of Spotify's playlists

In [5]:
# Walk through every page of the 'spotify' user's playlists and keep each playlist's ID
spotify_playlist_ids = []
playlists = sp.user_playlists('spotify')
while playlists:
    # The ID is taken as the last 22 characters of the playlist URI
    spotify_playlist_ids.extend(entry['uri'][-22:] for entry in playlists['items'])
    # Move on to the next page, or stop once there is none
    playlists = sp.next(playlists) if playlists['next'] else None
spotify_playlist_ids[:20]

['37i9dQZF1DXcBWIGoYBM5M',
 '37i9dQZF1DX0XUsuxWHRQd',
 '37i9dQZF1DX1lVhptIYRda',
 '37i9dQZF1DX10zKzsJ2jva',
 '37i9dQZF1DX4JAvHpjipBk',
 '37i9dQZF1DX4sWSpwq3LiO',
 '37i9dQZF1DX4SBhb3fqCJd',
 '37i9dQZF1DWXRqgorJj26U',
 '37i9dQZF1DX4dyzvuaRJ0n',
 '37i9dQZF1DXcF6B6QPhFDv',
 '37i9dQZF1DWXJfnUiYjUKT',
 '37i9dQZF1DXcRXFNfZr7Tp',
 '37i9dQZF1DX4o1oenSJRJd',
 '37i9dQZF1DXbTxeAdrVG2l',
 '37i9dQZF1DX4UtSsGT1Sbe',
 '37i9dQZF1DWTJ7xPn4vNaz',
 '37i9dQZF1DXaKIA8E7WcJj',
 '37i9dQZF1DWSV3Tk4GO2fq',
 '37i9dQZF1DWTwnEm1IYyoj',
 '37i9dQZF1DX2A29LI7xHn1']

In [6]:
# Total number of playlist IDs collected
len(spotify_playlist_ids)

1398

### Getting tracks from Spotify playlists

In [7]:
# Creating a function to get the first 50 track IDs from a playlist
def getTrackIDs(playlist_id, limit=50):
    """Collect track IDs from one of the 'spotify' user's playlists.

    Uses the module-level client ``sp`` and appends the IDs it finds to the
    module-level list ``ids``, which must exist before this is called.

    Parameters
    ----------
    playlist_id : str
        ID of the playlist to read.
    limit : int, default 50
        Maximum number of playlist items to inspect, counted from the start.

    Returns
    -------
    list
        The track IDs that this call appended to ``ids``.
    """
    playlist = sp.user_playlist('spotify', playlist_id)
    found = []
    for item in playlist['tracks']['items'][:limit]:
        track = item.get('track')
        # Skip items that carry no track object or no ID instead of raising or storing None
        if not track or not track.get('id'):
            continue
        found.append(track['id'])
    ids.extend(found)
    return found

In [8]:
# Creating a function to get the metadata and audio features of a track from its track id
def getTrackFeatures(track_id):
    """Return one row of metadata and audio features for ``track_id``.

    Uses the module-level client ``sp``: ``sp.track`` supplies the metadata and
    ``sp.audio_features`` supplies the audio features.

    Returns
    -------
    list
        ``[track_id, name, album, artist, release_date, length, popularity,
        danceability, acousticness, energy, instrumentalness, liveness, loudness,
        speechiness, tempo, time_signature]``

    Raises
    ------
    ValueError
        If no audio features were returned for the track.
    """
    meta = sp.track(track_id)
    features = sp.audio_features(track_id)
    # Fail with a clear message rather than a TypeError from subscripting None
    if not features or features[0] is None:
        raise ValueError("No audio features returned for track {!r}".format(track_id))

    audio = features[0]
    album = meta['album']

    return [
        # meta
        track_id,
        meta['name'],
        album['name'],
        # NOTE(review): this is the first artist of the *album*, not of the track itself;
        # confirm that is intended.
        album['artists'][0]['name'],
        album['release_date'],
        meta['duration_ms'],
        meta['popularity'],
        # features
        audio['danceability'],
        audio['acousticness'],
        audio['energy'],
        audio['instrumentalness'],
        audio['liveness'],
        audio['loudness'],
        audio['speechiness'],
        audio['tempo'],
        audio['time_signature'],
    ]

The cells below take a long time to run, so I commented them out.

In [9]:
# %%time
# # Gathering track ids
# ids = []
# for x in spotify_playlist_ids[:200]:
#     getTrackIDs(x)
# ids[:5]

CPU times: user 1.51 s, sys: 134 ms, total: 1.64 s
Wall time: 29.6 s


['7MAibcTli4IisCtbHKrGMh',
 '5QO79kh1waicV47BqGRL3g',
 '1diS6nkxMQc3wwC4G1j0bh',
 '4u4NyuceXP7Uzh7XFJKCr1',
 '3Ofmpyhv5UAQ70mENzB277']

In [10]:
# %%time
# # loop over track ids to get audio features for each track
# tracks = []
# for i in range(len(ids)):
#     try:  
#         track = getTrackFeatures(ids[i])
#         tracks.append(track)
#     except:
#         pass

# # create dataset
# df = pd.DataFrame(tracks, columns = ['track_id', 'name', 'album', 'artist', 'release_date', 'length', 'popularity', 'danceability', 'acousticness', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'time_signature'])
# df.head()

CPU times: user 42.5 s, sys: 6.26 s, total: 48.7 s
Wall time: 31min 57s


Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,7MAibcTli4IisCtbHKrGMh,Leave The Door Open,Leave The Door Open,Bruno Mars,2021-03-05,242096,90,0.586,0.182,0.616,0.0,0.0927,-7.964,0.0324,148.088,4
1,5QO79kh1waicV47BqGRL3g,Save Your Tears,After Hours,The Weeknd,2020-03-20,215626,97,0.68,0.0212,0.826,1.2e-05,0.543,-5.487,0.0309,118.051,4
2,1diS6nkxMQc3wwC4G1j0bh,We're Good,Future Nostalgia (The Moonlight Edition),Dua Lipa,2021-02-11,165506,88,0.722,0.0319,0.588,0.0,0.183,-5.932,0.0544,134.01,4
3,4u4NyuceXP7Uzh7XFJKCr1,Hold On,Hold On,Justin Bieber,2021-03-05,170813,89,0.658,0.0106,0.634,0.0,0.132,-5.797,0.0413,139.98,4
4,3Ofmpyhv5UAQ70mENzB277,Astronaut In The Ocean,Astronaut In The Ocean,Masked Wolf,2021-01-06,132780,94,0.778,0.175,0.695,0.0,0.15,-6.865,0.0913,149.996,4


In [11]:
# df.to_csv('playlist_songs.csv',index=False)

In [12]:
# Load the saved playlist tracks from 'playlist_songs.csv' and preview the first rows
df = pd.read_csv('playlist_songs.csv')
df.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,7MAibcTli4IisCtbHKrGMh,Leave The Door Open,Leave The Door Open,Bruno Mars,2021-03-05,242096,90,0.586,0.182,0.616,0.0,0.0927,-7.964,0.0324,148.088,4
1,5QO79kh1waicV47BqGRL3g,Save Your Tears,After Hours,The Weeknd,2020-03-20,215626,97,0.68,0.0212,0.826,1.2e-05,0.543,-5.487,0.0309,118.051,4
2,1diS6nkxMQc3wwC4G1j0bh,We're Good,Future Nostalgia (The Moonlight Edition),Dua Lipa,2021-02-11,165506,88,0.722,0.0319,0.588,0.0,0.183,-5.932,0.0544,134.01,4
3,4u4NyuceXP7Uzh7XFJKCr1,Hold On,Hold On,Justin Bieber,2021-03-05,170813,89,0.658,0.0106,0.634,0.0,0.132,-5.797,0.0413,139.98,4
4,3Ofmpyhv5UAQ70mENzB277,Astronaut In The Ocean,Astronaut In The Ocean,Masked Wolf,2021-01-06,132780,94,0.778,0.175,0.695,0.0,0.15,-6.865,0.0913,149.996,4


In [13]:
# (rows, columns) of the playlist tracks dataframe
df.shape

(9769, 16)

## Getting user's favorite tracks

In [14]:
# Getting the user's top tracks for the short-term time range.
# The top-tracks endpoint returns at most 50 items per request, so ask for exactly 50
# instead of an out-of-range 1000 (only 50 rows ever came back).
results = sp.current_user_top_tracks(limit=50, offset=0, time_range='short_term')

In [15]:
# Flatten the API response into one list per field
top_items = results['items']
track_name = [item['name'] for item in top_items]
track_id = [item['id'] for item in top_items]
artist = [item["artists"][0]["name"] for item in top_items]  # first listed artist only
album = [item["album"]["name"] for item in top_items]
duration = [item["duration_ms"] for item in top_items]
popularity = [item["popularity"] for item in top_items]

# Assemble the favourites dataframe from those lists
df_favourite = pd.DataFrame(
    {
        "track_name": track_name,
        "album": album,
        "track_id": track_id,
        "artist": artist,
        "duration": duration,
        "popularity": popularity,
    }
)

df_favourite.head()

Unnamed: 0,track_name,album,track_id,artist,duration,popularity
0,Feels Like Death,Feels Like Death,03vMyCyCK7pVWjC1i1zur0,Levi Carter,237505,44
1,Glory Boy,Freewave 3,5LpnrXjrt0BOU0iOGH78UN,LUCKI,111048,40
2,Tarantino,Almost There,1CzjJzPOCLkZr2oqiOrXjc,LUCKI,114773,38
3,Faith,Faith,0TqNfrOY2IrpFRI2zxsMq4,LUCKI,107467,36
4,Left 4 Dead,Days B4 III,4SRBv9M0wIPIw916zWfhkU,LUCKI,139130,39


In [16]:
%%time
fav_tracks = []
for track in df_favourite['track_id']:
    try:  
        track = getTrackFeatures(track)
        fav_tracks.append(track)
    except:
        pass

CPU times: user 234 ms, sys: 35 ms, total: 269 ms
Wall time: 9.53 s


In [17]:
# Build the favourite-tracks dataset (metadata plus audio features), one row per track
df_fav = pd.DataFrame(
    fav_tracks,
    columns=[
        'track_id', 'name', 'album', 'artist', 'release_date', 'length', 'popularity',
        'danceability', 'acousticness', 'energy', 'instrumentalness', 'liveness',
        'loudness', 'speechiness', 'tempo', 'time_signature',
    ],
)
df_fav.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,03vMyCyCK7pVWjC1i1zur0,Feels Like Death,Feels Like Death,Levi Carter,2018-04-24,237505,44,0.755,0.337,0.626,0.0,0.137,-6.247,0.293,110.088,4
1,5LpnrXjrt0BOU0iOGH78UN,Glory Boy,Freewave 3,LUCKI,2019-02-15,111048,40,0.784,0.0966,0.519,0.0109,0.0748,-9.868,0.096,129.953,3
2,1CzjJzPOCLkZr2oqiOrXjc,Tarantino,Almost There,LUCKI,2020-05-29,114773,38,0.79,0.0495,0.547,0.0,0.114,-9.974,0.108,135.037,4
3,0TqNfrOY2IrpFRI2zxsMq4,Faith,Faith,LUCKI,2020-04-17,107467,36,0.878,0.123,0.584,0.0,0.105,-10.014,0.419,125.069,4
4,4SRBv9M0wIPIw916zWfhkU,Left 4 Dead,Days B4 III,LUCKI,2019-10-25,139130,39,0.719,0.111,0.5,0.000416,0.0992,-9.654,0.318,137.924,4


In [18]:
# Label the rows: 1 for the user's favourite tracks, 0 for the playlist tracks
df_fav['favorite'] = 1
df['favorite'] = 0 

In [19]:
# Checking if both datasets have the same columns
# (element-wise comparison of the column labels: one boolean per column position)
df.columns == df_fav.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True])

## Preparing dataset for model creation

### Changing categorical variables to numerical

In [20]:
# Row/column counts of the playlist tracks and of the favourite tracks
print(df.shape)
print(df_fav.shape)

(9769, 17)
(50, 17)


In [21]:
# Stack the playlist tracks and the favourite tracks into a single dataframe
# NOTE(review): no ignore_index is passed, so each frame keeps its own row labels and
# index values repeat in `combined` -- select rows via the 'favorite' column, not by label.
combined = pd.concat([df, df_fav])
combined.shape

(9819, 17)

In [22]:
# Number of rows per value of the 'favorite' label
combined.favorite.value_counts()

0    9769
1      50
Name: favorite, dtype: int64

The favorite and non-favorite classes are heavily imbalanced (50 favorites vs. 9,769 other songs), so I will need to address this when building the model.

In [23]:
# Ordinal-encode the text columns: every distinct value is replaced by its 1-based
# position in the sorted list of that column's distinct values.
categoricals = ['name', 'album', 'artist', 'release_date']
for cat in categoricals:
    # Drop missing values first: NaN cannot be sorted together with strings.
    # Missing entries stay missing after the mapping.
    sorted_values = sorted(combined[cat].dropna().unique())
    codes = {value: code for code, value in enumerate(sorted_values, start=1)}
    # map() does one dictionary lookup per row, which is far cheaper than replace()
    # with a dictionary holding thousands of keys
    combined[cat] = combined[cat].map(codes)
df = combined
df.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
0,7MAibcTli4IisCtbHKrGMh,4008,3340,581,2524,242096,90,0.586,0.182,0.616,0.0,0.0927,-7.964,0.0324,148.088,4,0
1,5QO79kh1waicV47BqGRL3g,6038,212,3746,2293,215626,97,0.68,0.0212,0.826,1.2e-05,0.543,-5.487,0.0309,118.051,4,0
2,1diS6nkxMQc3wwC4G1j0bh,8118,2222,1091,2505,165506,88,0.722,0.0319,0.588,0.0,0.183,-5.932,0.0544,134.01,4,0
3,4u4NyuceXP7Uzh7XFJKCr1,3096,2743,1917,2524,170813,89,0.658,0.0106,0.634,0.0,0.132,-5.797,0.0413,139.98,4,0
4,3Ofmpyhv5UAQ70mENzB277,542,433,2439,2478,132780,94,0.778,0.175,0.695,0.0,0.15,-6.865,0.0913,149.996,4,0


In [24]:
# Creating dataframe of favorite songs with categorical encodings
# (the rows of the encoded frame whose 'favorite' label is 1)
df_fav = df.loc[df['favorite'] == 1]
df_fav.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
0,03vMyCyCK7pVWjC1i1zur0,2360,2039,2181,1956,237505,44,0.755,0.337,0.626,0.0,0.137,-6.247,0.293,110.088,4,1
1,5LpnrXjrt0BOU0iOGH78UN,2723,2186,2107,2088,111048,40,0.784,0.0966,0.519,0.0109,0.0748,-9.868,0.096,129.953,3,1
2,1CzjJzPOCLkZr2oqiOrXjc,7094,300,2107,2335,114773,38,0.79,0.0495,0.547,0.0,0.114,-9.974,0.108,135.037,4,1
3,0TqNfrOY2IrpFRI2zxsMq4,2269,1971,2107,2310,107467,36,0.878,0.123,0.584,0.0,0.105,-10.014,0.419,125.069,4,1
4,4SRBv9M0wIPIw916zWfhkU,4014,1489,2107,2211,139130,39,0.719,0.111,0.5,0.000416,0.0992,-9.654,0.318,137.924,4,1


In [25]:
# Removing favorite songs from playlist songs
# (keeps only the rows whose 'favorite' label is not 1)
df = df.loc[df['favorite'] != 1]
df.shape

(9769, 17)

In [26]:
# (rows, columns) of the encoded favourite-tracks dataframe
df_fav.shape

(50, 17)

In [27]:
# # Saving these dataframes to use in model creation
# df.to_csv('encoded_playlist_songs.csv', index=False)
# df_fav.to_csv('favorite_songs.csv', index=False)