In [2]:
# load modules
import numpy as np
import pandas as pd
import json
from os import listdir
from sklearn.model_selection import train_test_split
from itertools import compress
import warnings
import time
import spotipy
import spotipy.util as util
import spotipy.oauth2 as oauth2
from scipy.spatial import distance

In [6]:
# load in data if needed
mp_playlist_data = pd.read_pickle('mp_playlist_data_nf7')

In [7]:
# grab all songs in our dataset
all_songs = set()
for i in range(mp_playlist_data.shape[0]):
    temp_set = set(pd.DataFrame(mp_playlist_data.iloc[i]['tracks'])['track_uri'])
    all_songs = all_songs.union(temp_set) 
all_songs_list = list(all_songs)

In [8]:
print('Number of playlists: {}'.
      format(mp_playlist_data.shape[0]))
print('Average number of tracks per playlist: {:.4f}'.
      format(mp_playlist_data['num_tracks'].mean()))
print('Average durations (ms) of playlists: {:.4f}'.
      format(mp_playlist_data['duration_ms'].mean()))
print('Total number of tracks: {}'.
      format(len(all_songs)))

Number of playlists: 11173
Average number of tracks per playlist: 100.4752
Average durations (ms) of playlists: 23785654.4912
Total number of tracks: 263985


In [9]:
# split the data into a train and test set
test_size = 0.2
mp_playlist_train_df, mp_playlist_test_df = train_test_split(mp_playlist_data, 
                                                             test_size = test_size)

In [10]:
# clean train data
# dictionaries
# dictionary of pid: # number of followers
n_followers_train_dict = {}
# dictionary of pid: dataframe of track info
mp_playlist_train_dict = {}
for i in range(mp_playlist_train_df.shape[0]):
    n_followers_train_dict[mp_playlist_train_df.iloc[i]['pid']] = mp_playlist_train_df.iloc[i]['num_followers']
    mp_playlist_train_dict[mp_playlist_train_df.iloc[i]['pid']] = pd.DataFrame(mp_playlist_train_df.iloc[i]['tracks'])
    

In [303]:
# helper class for indexing a list with a mask
class MaskableList(list):
    def __getitem__(self, index):
        try: return super(MaskableList, self).__getitem__(index)
        except TypeError: return MaskableList(compress(self, index))

In [715]:
# clean test data

# hold track data in test list of lists (series)
mp_playlist_test_list = []

# randomly remove a percentage of tracks from each playlists 
# of the test list
rem_frac = 0.7
# the actual data to test on i.e. make predictions on
mp_playlist_test_list_testON = []

# fill the lists
for i in range(mp_playlist_test_df.shape[0]):
    # create temp variable holding tracks
    temp = pd.DataFrame(mp_playlist_test_df.iloc[i]['tracks'])['track_uri'].values
    # append to growing list
    mp_playlist_test_list.append(temp)
    # mask to create testON list
    msk = np.random.rand(len(temp)) < rem_frac
    # filter
    temp = temp[msk]
    # append to testON list
    mp_playlist_test_list_testON.append(temp)

In [759]:
# this is the algo
class RecSongs():
    '''
    Class to recommend songs. 
    Initializes and stores the 'training' data, and 'fits' the test data,
    an input playlist for which to recommend songs, to the training data.
    '''
    
    # initialization
    def __init__(self, train):
        '''
        train is a dictionary of dataframes
         - key: pid, the playlist id
         - value: dictionary of info on the tracks 
        '''
        
        # the 'training data'
        self.train = train
        
        # has the function been fit?
        self.is_fit = False
        
    # find the songs to recommend
    def fit(self, test, n_songs, flag_weight=False, flag_sec_layer=False, *opts):
        '''
        Function finds songs to recommend for the input playlists from 
        the training data by finding the playlists in the training data that
        are most similar to the input playlist based on the overlap of tracks, 
        then recommendding songs that occur in the train playlist but not in 
        the input playlist. If flag_sec_layer is set to True, the latter 
        recommended songs will be passed through a second layer, which determines
        the most similar songs from the latter recommended songs based on
        Spotify audio features of the songs. 

        Input:
        - test: the input playlist data for which to find similar songs
                - it is a list where each element is a playlists, a list of tracks 
        - n_songs: the number of songs to recommend for each track
        - flag_weight a flag for weighing the similarity measure of song overlap
        - flag_sec_layer: a flag for whether to use the second layer 
        - *opts: an optional argument of weights and number of songs for second layer
        
        Output:
        - list of lists of recommendded songs
        '''
        
        # the first layer of the recommender system
        def layer_one(self, test, n_songs):
            '''
            The first layer of finding the most similar songs
            - similarity measured on song overlap
            - performed for every input playlist of many
            
            *test is an individual playlist, a list of tracks, or more
            specifically, a list of track_uri's
            '''
            
            # dictionary to keep track of how much overlap there is betwwen
            # the input playlist and the training playlists
            song_overlap = {}

            # fill similarity dictionary
            # - also conditional for if weights argument is given
            if self.flag_weight:
                for pid, tracks in self.train.items():
                    song_overlap[pid] = self.weights[pid]*len(set(tracks['track_uri']) & set(test))    
            else:
                for pid, tracks in self.train.items():
                    song_overlap[pid] = len(set(tracks['track_uri']) & set(test))

            # create a sorted list from the dictionary
            # - most similar songs first
            song_overlap_sorted = sorted(song_overlap.items(), key=lambda x: x[1], reverse=True)

            # keep track of songs to recommend
            rec_songs = set()

            # fill the recommend songs
            for i in range(len(song_overlap_sorted)):
                
                # check if we've reached desired number of songs
                if len(rec_songs) > n_songs:
                    break
                
                # grab pid
                pid = song_overlap_sorted[i][0]
                
                songs_to_add = set(self.train[pid]['track_uri']) - set(test)

                # add songs
                # - set operation to not add songs that have already been added
                rec_songs = rec_songs.union(songs_to_add)
            
            # done
            # return list of the set of recommended songs
            return(list(rec_songs))
        
        # second layer 
        def layer_two(recs, test, n_songs_2):
            '''
            Takes in a list of recommended songs and produces the 'most similar'
            recommended songs to the input playlist based on Spotify feature
            
            Input: 
            - recs: the recommended songs from layer_one
            - n_songs_2: the number of songs to output; should be smaller than the number of 
            songs we recommended from layer_one
            - test will be the individual input playlists we are testing against
            
            Output:
            - list of lists of recommended songs 
            '''
            
            # helper function to get the numerics
            def get_numerics(uri_lst):
                '''
                Grabs features of a list of songs from the Spotify API. 

                Input: 
                - uri_list: a list of Spotify uris corresponding to songs
                    * should not exceed length of 50

                Output: 
                - df: a dataframe of the Spotify features for the input songs
                '''
                
                # check input
                if len(uri_lst) > 50: 
                    warnings.warn('Cannot make Spotify API call for more than 50 songs at a tiem.')
                    return(None)

                # the Spotify song features that we want
                columns = ['danceability', 'energy', 'key', 
                           'loudness', 'speechiness', 'acousticness', 
                           'instrumentalness', 'liveness', 'valence', 'tempo']

                # list to store the features 
                lst = []

                # Spotify API call returns a list of dictionaries 
                song_dct = self.spotify.audio_features(uri_lst)

                # fill in list 
                for dct_elt in song_dct:
                    # grab and store features
                    try:
                        numeric_vals = list(dct_elt.values())
                        numeric_valss = numeric_vals[:4] + numeric_vals[5:11]
                        lst.append(numeric_valss)
                    except AttributeError:
                        warnings.warn("AttributeError Encountered. Caution.")
                        return(None)

                # create dataframe to store the data
                df = pd.DataFrame(lst, columns=columns)
                df['track_uri'] = uri_lst

                # done
                return(df)
            
            # helper function to get the distance measure
            def find_closest_songs(df_recs, df_test):
                '''
                Give a dataframe of song feautres for the input playlist, which
                we call 'df_test', we find the simialarity measure i.e. the distance
                between those test songs and the input songs we identified in layer one. 
                
                Input: 
                - df_recs: a dataframe of the song features of the songs we identified in
                the the first layer
                - df_test: a dataframe of the song features of the songs in the input playlist
                '''
                # grab global means of features for the input
                lst_of_avgs = np.mean(df_test.iloc[:,:-1])
                
                # store scores
                scores = np.zeros(df_recs.shape[0])
                
                # fill scores
                for i in range(df_recs.shape[0]):
                    try:
                        scores[i] = distance.euclidean(df_recs.iloc[i].values[:-1], lst_of_avgs)
                    except ValueError:
                        warnings.warn("ValueError Encountered. Caution.")
                        scores[i] = np.infty
                
                songs = df_recs['track_uri'].iloc[np.argsort(scores)].values
                
                return(songs) 
            
            # LINK SPOTIFY---------------------------------------------------
            credentials = oauth2.SpotifyClientCredentials(
                client_id='e157f2ab692247ba80afdbec86343c62',
                client_secret='5b5f50549b4443139fe147e25674a0ee')
            token = credentials.get_access_token()
            self.spotify = spotipy.Spotify(auth=token)
            #----------------------------------------------------------------
            
            # store the 'test' and "train' dataframes
            df_test = pd.DataFrame()
            df_recs = pd.DataFrame()
            
            # lengths
            len_test = len(test)
            len_recs = len(recs)
            
            # grab features for the test test
            step = 50
            # start beginning index
            beg_ind = 0
            # run for the test set
            for ind in list(range(step,len_test,step)) + [len_test]:
                # stores list of 50 songs
                temp_list = test[beg_ind:ind]

                # obtain song features
                temp_feats = get_numerics(temp_list)
                if type(temp_feats) != pd.core.frame.DataFrame:
                    warnings.warn('None returned. Caution.')
                else:
                    df_test = df_test.append(temp_feats)
                # move index
                beg_ind = ind
                
                # wait
                # wait two seconds
                time.sleep(1)
            
            # re-start beginning index
            beg_ind = 0
            # run for the rec set
            for ind in list(range(step,len_recs,step)) + [len_recs]:
                # stores list of 50 songs
                temp_list = recs[beg_ind:ind]
                # obtain song features
                temp_feats = get_numerics(temp_list)
                if type(temp_feats) != pd.core.frame.DataFrame:
                    warnings.warn('None returned. Caution.')
                else:
                    df_recs = df_recs.append(temp_feats)
                # move index
                beg_ind = ind
                
                # wait
                # wait two seconds
                time.sleep(1)
            
            # find the closest songs
            songs = find_closest_songs(df_recs, df_test)[:n_songs_2]
            
            # done
            return(songs)
          
        # weight handling
        self.flag_weight = flag_weight
        # second layer handling
        self.flag_sec_layer = flag_sec_layer
        
        # two flags
        if self.flag_weight and self.flag_sec_layer:
            # length needs to match
            if len(opts) != 2:
                warnings.warn('Two flags: make sure flags and optional arguments agree!')
                return(None)
            else:
                # type of first input
                if type(opts[0]) == dict:
                    # check length of weights
                    if len(opts[0]) != len(self.train):
                        warnings.warn('Weights and input train playlists do not have the same dimension.')
                        return(None)
                    else: 
                        self.weights = opts[0]
                else:
                    warnings.warn('Weight should be a dict!')
                
                # second layer conditionals
                if opts[1] > n_songs:
                    warnings.warn('Second layer of songs to recommend does not agree with the first layer.')
                    return(None)
                else:
                    # store value
                    n_songs_2 = opts[1]
                    
        # just one flag
        elif self.flag_weight or self.flag_sec_layer:
            # check length
            if len(opts) != 1:
                warnings.warn('One flag: make sure flags and optional arguments agree!')
                return(None)
            else:
                # assing opt args
                if self.flag_weight:
                    self.weights = opts[0]
                else:
                    n_songs_2 = opts[0]
        else:
            # no flag
            if len(opts) != 0:
                warnings.warn('No flag - additional optional arguments ignored!')

        # store the test data for use later on
        self.test = test
        
        # list of lists of recommended songs
        rec_song_lists = []
        
        # fill the list of recommended songs
        # perform layer_one for each input playlist
        for i in range(len(self.test)):
            # append the located list to the list of lists
            rec_song_lists.append(layer_one(self, self.test[i], n_songs))
        
        # store the recommendations
        self.rec_song_lists = rec_song_lists
        
        # update is_fit 
        self.is_fit = True
        
        # perform second layer optional on the list of already recommended songs
        if self.flag_sec_layer:
            for i in range(len(self.test)):
            #for i in range(1):
                self.rec_song_lists[i] = layer_two(self.rec_song_lists[i], self.test[i], n_songs_2)
            
        return(self.rec_song_lists)
    
    # calculate accuracy
    def rec_acc(self, true):
        '''
        Calculates the accuracy of the fitted recommendation scores. 
        
        'True' is the actual FULL playlist data i.e. the list of playlists before any of 
        the component tracks were removed. 
        '''
        
        # ceck if we have fit the algo first i.e.
        # if is_fit is false then the fit function has not been called, meaning
        # we cannot call the accuracy function until the fit function has been called
        if not self.is_fit:
            # raise warning
            warnings.warn('You need to fit the object first!')
            
            # return nothing
            return(None)
        
        else:
            
            # check length of input
            if len(self.rec_song_lists) != len(true):
                warnings.warn('Make sure the number of playlists match for inputs!')
            else:
                
                # store the accuracy scores
                acc_scores = []
                
                # fill the scores
                for i in range(len(self.rec_song_lists)):
                    # how many did we get correct
                    overlap = len(set(self.rec_song_lists[i]) & set(true[i]))
                    
                    # how many songs were missing i.e. how many did we remove
                    missing = len(set(true[i]) - set(self.test[i]))
                    if missing == 0:
                        missing = np.infty
                    
                    acc_scores.append(overlap/missing)
                
                # done
                return(acc_scores)


In [717]:
# takes a few soconds
r1 = RecSongs(mp_playlist_train_dict)
a1 = r1.fit(mp_playlist_test_list_testON, 300)
np.mean(r1.rec_acc(mp_playlist_test_list))

0.2774400395032961

In [654]:
r2 = RecSongs(mp_playlist_train_dict)
a2 = r2.fit(mp_playlist_test_list_testON, 300, True, False, n_followers_train_dict)
np.mean(r2.rec_acc(mp_playlist_test_list))

0.15795265484641252

In [736]:
# takes a few soconds
r3 = RecSongs(mp_playlist_train_dict)
a3 = r3.fit(mp_playlist_test_list_testON, 400, False, True, 300)
np.mean(r3.rec_acc(mp_playlist_test_list))

0.20594089574115876

In [761]:
# takes a few soconds
r4 = RecSongs(mp_playlist_train_dict)
a4 = r4.fit(mp_playlist_test_list_testON, 400, True, True, n_followers_train_dict, 300)
np.mean(r4.rec_acc(mp_playlist_test_list))

0.11546202352679169