In [13]:
import copy
import json
import os
import time

import numpy as np
import pandas as pd
import requests
import sklearn
import spotipy
import spotipy.util as util
from scipy.spatial import distance_matrix
# from .pearing_classes import Playlist_Base
# from .pearing_core_functions import get_track_distances, create_playlist


def Pear_Playlists(playlist_name_1, playlist_name_2, access_token, user_id):
    ### INITIALIZE THINGS
    
    prev_time = time.time()
    
    p1_base = Playlist_Base(playlist_name = playlist_name_1, user_id = user_id, access_token = access_token)
    
    cur_time = time.time()
    print('p1_base created: ', cur_time - prev_time)
    prev_time = cur_time
    
    p2_base = Playlist_Base(playlist_name = playlist_name_2, user_id = user_id, access_token = access_token)
    
    cur_time = time.time()
    print('p2_base created: ', cur_time - prev_time)
    prev_time = cur_time

    
    playlist_bases = [p1_base, p2_base]
    
    track_distances = get_track_distances(playlist_bases)
    
    ### FIND TRACKS OVERVIEW
    # Look at each recommended track
    # Store each track in a df, with each column being the distance to one of the playlists


    # Get a list of all track_ids in each playlist to help with sorting later
    p0_track_ids = playlist_bases[0].orig_playlist_tracks_df['track_id']
    p1_track_ids = playlist_bases[1].orig_playlist_tracks_df['track_id']




    # Make 3 lists to insert into data frame - PFI: need a more robust solution for more playlists
    track_ids = [None]*len(track_distances)
    p0_dist = [None]*len(track_distances)
    p1_dist = [None]*len(track_distances)
    for index, rec_track in track_distances.iterrows():
        # The index value of each 'rec_track' is every column in the df (aka all original track_ids)
        # We want to subset this to the specific playlists so that we can determine distance to each playlist
        track_ids[index] = rec_track['track_id']

        # the track ids in p0_track_ids should match the columns (aka index) of rec_track
        # Only look at those, then get avg dist
        p0_track_distances = rec_track[rec_track.index.isin(p0_track_ids)]
        p0_dist[index] = p0_track_distances.mean()

        p1_track_distances = rec_track[rec_track.index.isin(p1_track_ids)]
        p1_dist[index] = p1_track_distances.mean()

    # Now find avg distane of each song to p0 and p1 clusters
    cols = ['track_id', 'p0', 'p1']
    rec_tracks_to_playlists = pd.DataFrame({'track_id' : track_ids, 'p0': p0_dist, 'p1' : p1_dist})
    rec_tracks_to_playlists['avg_dist'] = rec_tracks_to_playlists.mean(axis = 1)

    # Grab 20 closest songs
    final_tracks = rec_tracks_to_playlists.sort_values(by = ['avg_dist']).iloc[0:20]
    
    ## BUILD PLAYLIST
    sp = spotipy.Spotify(auth=access_token[7:])

    # Spotipy doesn't return the playlist_id so use custom function (thanks Simon Quick)
    playlist_id = create_playlist(access_token, user_id.encode(), "SpotiPear_Playlist")
    #sp.user_playlist_create(user_id, "SpotiPear_Playlist", public = True)
    
    # Now let's add some tracks
    track_ids_to_add = final_tracks['track_id']
    
    sp.user_playlist_add_tracks(user_id, playlist_id, track_ids_to_add)

    
    cur_time = time.time()
    print('playlists joined: ', cur_time - prev_time)
    prev_time = cur_time





In [138]:
import spotipy
#from .pearing_core_functions import get_tracks_from_playlist, get_track_features, get_similar_tracks_from_playlist


# Want an object that encapsulates our API pull of tracks and subsequent API pull of recommendations and features
# Playlist_Base Class
# Has:
# MUST BE SPECIFIED AT CREATE TIME

# user_id - spotify user id
# access_token - returned from spotify api

# playlist_name - string

# From that generate:
# orig_playlist_dict
# orig_playlist_tracks_df - includes features
# rec_tracks_df - includes features


class Playlist_Base():
    def __init__(self, playlist_name = '', user_id = '', access_token = ''):
        prev_time = time.time()
        
        self.playlist_name = playlist_name
        self.user_id = user_id
        self.access_token = access_token
        # Access_token from spotify_api has 'Bearer ' in front
        # Spotipy token built on not having Bearer, so ditch that
        self.sp_token = access_token[7:]
        try:
            sp = spotipy.Spotify(auth=self.sp_token)
            all_api_playlists = sp.user_playlists(self.user_id)
            self.spotipy_connection = sp
        except:
            print("Something went wrong hitting spotify API - did you provide the right token and username?")
        sp = spotipy.Spotify(auth=self.sp_token)
        all_api_playlists = sp.user_playlists(self.user_id)
        
        
#         cur_time = time.time()
#         print('all initialization stuff: ', cur_time - prev_time)
#         prev_time = cur_time
        
        self.generate_from_playlist_name(playlist_name)
        
#         cur_time = time.time()
#         print('self.generate_from_playlist done: ', cur_time - prev_time)
#         prev_time = cur_time
    
    # takes a playlist_name and populates all other relevant variables of object
    # If a new playlist name is passed, it will overwrite all values
    def generate_from_playlist_name(self, playlist_name):
        prev_time = time.time()
        
        self.playlist_name = playlist_name
        # Get all user's playlists from API 
        sp = spotipy.Spotify(auth=self.sp_token)
        all_api_playlists = sp.user_playlists(self.user_id)
        
        for api_playlist in all_api_playlists['items']:
            if(api_playlist['name'] == playlist_name):
                self.orig_playlist_dict = api_playlist
        
        cur_time = time.time()
        print('all initialization stuff: ', cur_time - prev_time)
        prev_time = cur_time
        
        # Now have the original playlist - if a user has 2 playlists of same name this will pick whichever playlist appears last in the list
        
        
        ### GET FEATURES OF ORIGINAL PLAYLIST
        orig_playlist_tracks_tmp = get_tracks_from_playlist(self.user_id, self.orig_playlist_dict, self.spotipy_connection)
        
        cur_time = time.time()
        print('orig_playlist_tracks_tmp: ', cur_time - prev_time)
        prev_time = cur_time
        
        self.orig_playlist_tracks_df = get_track_features(orig_playlist_tracks_tmp, self.access_token, self.user_id)
        
        
        cur_time = time.time()
        print('orig_playlist_tracks_df: ', cur_time - prev_time)
        prev_time = cur_time
        
        ### CREATE RECS FROM PLAYLIST -- PFI: could make this function much more robust, look at artist/genre too
        
        # Get recommended tracks as dict
        rec_tracks_tmp = get_similar_tracks_from_playlist(self.user_id, self.orig_playlist_dict, self.access_token)
        
        cur_time = time.time()
        print('rec_tracks_tmp: ', cur_time - prev_time)
        prev_time = cur_time
        
        
        self.rec_tracks_df = get_track_features(rec_tracks_tmp, self.access_token, self.user_id)
        
        cur_time = time.time()
        print('rec_tracks_df: ', cur_time - prev_time)
        prev_time = cur_time
        

        


In [132]:

import spotipy

import pandas as pd 
import numpy as np
import copy

import sklearn
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance_matrix
import requests
import json
import math



# Given a username and a playlist_dict, generate a dataframe of 'similar' songs
# PFI: This function could be way more robust, looking at artists/genres as well as randomizing/subsampling to improve speed

# PFI: Get Recommendations off of Recommendations (probably only want to do recursively once)
def get_similar_tracks_from_playlist(user_id, playlist_dict, access_token):
    api_headers = {'Authorization': access_token}
    api_url = 'https://api.spotify.com/v1/playlists/' + playlist_dict['id'] + '/tracks'
    api_get_response = requests.get(api_url, headers = api_headers )
    tracks_json = api_get_response.json()

    track_ids = [None]*len(tracks_json['items'])
    artist_ids = [None]*len(tracks_json['items'])
    for i, item in enumerate (tracks_json['items']):
        track_ids[i] = tracks_json['items'][i]['track']['id']
        artist_ids[i] = tracks_json['items'][i]['track']['artists'][0]['id']

    track_ids = [x for x in track_ids if x != None]
    artist_ids = [x for x in artist_ids if x != None]
    
    return get_recs_from_track_ids (track_ids, access_token)

def get_recs_from_track_ids (track_ids, access_token):
    api_headers = {'Authorization': access_token}
    api_url_base = 'https://api.spotify.com/v1/recommendations?seed_tracks=' 
    rec_tracks_list = [None]*int((len(playlist_var_track_ids)/5)-1)
    
    # Can only get recs from 5 tracks at a time (get 20 back from API)
    # Store each set of 20 in a list & combine later
    for i in range(0,len(rec_tracks_list)):
        api_url = api_url_base + ','.join(track_ids[i*5:(i+1)*5])
        api_get_response = requests.get(api_url, headers = api_headers )
        tracks_json = api_get_response.json()
        cur_rec_tracks = [None]*20
        for j in range(0,20):
            cur_rec_tracks[j] = track_recs_json['tracks'][j]['id']
        rec_tracks_list[i] = cur_rec_tracks
#        rec_tracks_list[i] = spotipy_connection.recommendations(seed_tracks = playlist_var_track_ids[i*5:(i+1)*5])
        
    # rec_tracks_list is a list of lists, each sublist has a track
    # extract out to a single list
    # be default each recommendation call to API sends back 20 recs
    rec_tracks_list_full = [None]*20*len(rec_tracks_list)

    # Iterate over all sets of recommendations from groups of 5 SpotifyIDs
    for i, rec_set in enumerate(rec_tracks_list):
        # Iterate over every track
        for j, track_id in enumerate(rec_set):
  
            rec_tracks_list_full[i*20 + j] = track_id
            

    all_track_ids = pd.DataFrame(np.array(rec_tracks_list_full))
    all_track_ids.columns=['track_id']


    return all_track_ids
    


# Get the features for every song in a dataframe
def get_track_features(track_df, access_token, user_id):

    cols_to_keep = list(['track_id','danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo'])
    
    # Can only get up to 100 songs at a time from API
    # make empty df, iterate over every 100 songs and append
    track_features_df = pd.DataFrame(columns = cols_to_keep)
    track_list = track_df['track_id'].tolist()
    api_headers = {'Authorization': access_token}
    for i in range(0,(math.ceil(len(track_list)/100))):
        track_list_tmp = track_list[i*100:(i+1)*100]
        api_url = 'https://api.spotify.com/v1/audio-features/?ids=' + ','.join(track_list_tmp)
        api_get_response = requests.get(api_url, headers = api_headers )
        track_features_json = api_get_response.json()
        track_features_df_tmp = pd.DataFrame(track_features_json['audio_features'])
        track_features_df_tmp['track_id'] = track_features_df_tmp['id']
        track_features_df_tmp = track_features_df_tmp[cols_to_keep]
        track_features_df = track_features_df.append(track_features_df_tmp)

    return track_features_df



# Given a playlist (in json format) (and the username of its owner) return a dataframe of the track IDs

def get_tracks_from_playlist(username, playlist, spotipy_connection):
    
    # Do some stuff to account for the hierarchical structure of what we get back
    top_level_tracks = spotipy_connection.user_playlist(username, playlist['id'],
                    fields="tracks,next")

    second_level_tracks = top_level_tracks['tracks']
    
    # Initialize empty list for track_ids
    playlist_track_ids = [None]*len(second_level_tracks['items'])
    
    #PFI: could do something similar for artist/genre and Pear on those as well
    #playlist_var_artist_ids = [None]*len(second_level_tracks['items'])
    #playlist_1_genre_ids = [None]*len(second_level_tracks['items'])
    
    for i, item in enumerate(second_level_tracks['items']):
        cur_track = item['track']
        playlist_track_ids[i] = cur_track['id']
        #playlist_var_artist_ids[i] = cur_track['artists'][0]['id']


    # Sometimes we get a track with a "none" value, get rid of these 
    playlist_track_ids = [x for x in playlist_track_ids if x != None]
    # Make it a DF
    to_return = pd.DataFrame(np.array(playlist_track_ids))
    to_return.columns=['track_id']
    return to_return





# Given a set of chosen playlist names, and the returned API call of all the user's playlists (dict of each playlist)
# returns a list of individual playlist (dict) objects matching the names provided
def get_api_playlists_from_names(chosen_playlist_names, all_api_playlists):
    # Initialize List to place playlists in
    chosen_api_playlists = [None]*len(chosen_playlist_names)
    
    # Iterate over each name, then all items in api, then check for match (PSI: double for loops are bad?)
    for i, playlist_name in enumerate(chosen_playlist_names):
        for api_playlist in all_api_playlists['items']:
            if(api_playlist['name'] == playlist_name):
                chosen_api_playlists[i] = api_playlist
    
    return chosen_api_playlists 



#Given set of Playlist_bases objects, get a distance matrix of all tracks to all other tracks (will be redundancies)
# Assumes the rec_tracks_df of all base playlists has the same columns
def get_track_distances(playlist_bases):

    # Need a df with all tracks as rows
    # And All features as columns
    
    # Create empty data frame to union results onto
    all_rec_tracks_tmp = pd.DataFrame(columns = playlist_bases[0].rec_tracks_df.columns)
    all_orig_tracks_tmp = pd.DataFrame(columns = playlist_bases[0].orig_playlist_tracks_df.columns)
    
    for playlist_base in playlist_bases:
        rec_tracks_df = playlist_base.rec_tracks_df
        all_rec_tracks_tmp = all_rec_tracks_tmp.append(rec_tracks_df)
        orig_playlist_tracks_df = playlist_base.orig_playlist_tracks_df
        all_orig_tracks_tmp = all_orig_tracks_tmp.append(orig_playlist_tracks_df)
        
    # Mode & Key did bad stuff in early analysis so remove them
    all_rec_tracks = all_rec_tracks_tmp.reset_index().drop(['index','mode', 'key'], axis = 1)
    all_orig_tracks = all_orig_tracks_tmp.reset_index().drop(['index','mode', 'key'], axis = 1)
    
    
    # We combined all dataframes into 1 so that we could scale them
    # We can't individually scale the dataframes separately (avg of df1 is 3, avg of df2 is 4 but those both individually scale to 0)
    # So need to scale all songs together
    all_tracks = all_rec_tracks.append(all_orig_tracks).reset_index().drop(['index'], axis = 1)
    
    all_tracks_scaled = pd.DataFrame(StandardScaler().fit_transform(all_tracks.drop(['track_id'], axis = 1)))
    
    # Now get all original tracks, compare distance 
    all_rec_tracks_scaled = all_tracks_scaled.iloc[0:len(all_rec_tracks)]
    all_rec_tracks_scaled['track_id'] = all_rec_tracks['track_id']
    all_orig_tracks_scaled = all_tracks_scaled.iloc[len(all_rec_tracks):all_tracks_scaled.shape[0]].reset_index().drop(['index'], axis = 1)
    all_orig_tracks_scaled['track_id'] = all_orig_tracks['track_id']
    
    all_tracks_scaled['track_id'] = all_tracks['track_id']
    
    dist_matrix = pd.DataFrame(distance_matrix(all_rec_tracks_scaled.drop(['track_id'], axis = 1).values, all_orig_tracks_scaled.drop(['track_id'], axis = 1).values))
    
    # Now have a distance matrix whose rows are all of our recommended tracks
    # Columns are all original tracks, and values are the distance between the songs (after all were normalized together)
    # Make the column names and row names the actual track_ids 
    dist_matrix.columns = all_orig_tracks['track_id']
    dist_matrix.insert(0, 'track_id', all_rec_tracks['track_id'])
    
    
    return dist_matrix



# Leverage spotipy here instead, could be more concise

def create_playlist(access_token, user_id, title):
    cp_headers = {'Authorization': access_token, 'Content-Type': 'application/x-www-form-urlencoded'}
    cp_post = {'name': title, 'public': 'true', 'collaborative': 'false',
               'description': 'testing this out'}
    cp_url = 'https://api.spotify.com/v1/users/' + user_id.decode("utf-8") + '/playlists'
    r_cp = requests.post(cp_url, headers=cp_headers, data=json.dumps(cp_post))

    print (r_cp.status_code)

    if str(r_cp.status_code) != '201':
        print (r_cp.json())

        return "It didnt work yo - try again"
    r_cp_json = r_cp.json()
    playlist_id = r_cp_json['id']
    owner_id = r_cp_json['owner']['id']
 
    return  playlist_id #full_pl







In [133]:
scope = 'playlist-modify-public'
client_id ='a63ddab3e3d147898a0df1d5658f9ee9'
client_secret ='226f21e7e723484f909f768fdadada7c'
redirect_uri ='http://localhost/'
user_id = '22r6slwbns4u7hkhn3hjhjhyi'

token = util.prompt_for_user_token(user_id, scope = scope, client_id = client_id, client_secret = client_secret, redirect_uri = redirect_uri)





In [139]:
import time

access_token = 'Bearer ' + token
playlist_name_1 = 'Dad Music'
playlist_name_2 = 'tmp2'

start_time = time.time()

p1_base = Playlist_Base(playlist_name = playlist_name_1, user_id = user_id, access_token = access_token)
#Pear_Playlists(playlist_name_1, playlist_name_2, access_token, user_id)

print('took this long: ', time.time() - start_time)

all initialization stuff:  0.2877519130706787
orig_playlist_tracks_tmp:  0.2104189395904541
orig_playlist_tracks_df:  0.15059709548950195
rec_tracks_tmp:  3.404110908508301
rec_tracks_df:  0.5610520839691162
took this long:  5.019638776779175


In [129]:
def get_recs_from_track_ids (track_ids, access_token):
    api_headers = {'Authorization': access_token}
    api_url_base = 'https://api.spotify.com/v1/recommendations?seed_tracks=' 
    rec_tracks_list = [None]*int((len(playlist_var_track_ids)/5)-1)
    
    # Can only get recs from 5 tracks at a time (get 20 back from API)
    # Store each set of 20 in a list & combine later
    for i in range(0,len(rec_tracks_list)):
        api_url = api_url_base + ','.join(track_ids[i*5:(i+1)*5])
        api_get_response = requests.get(api_url, headers = api_headers )
        tracks_json = api_get_response.json()
        cur_rec_tracks = [None]*20
        for j in range(0,20):
            cur_rec_tracks[j] = track_recs_json['tracks'][j]['id']
        rec_tracks_list[i] = cur_rec_tracks
#        rec_tracks_list[i] = spotipy_connection.recommendations(seed_tracks = playlist_var_track_ids[i*5:(i+1)*5])
        
    # rec_tracks_list is a list of lists, each sublist has a track
    # extract out to a single list
    # be default each recommendation call to API sends back 20 recs
    rec_tracks_list_full = [None]*20*len(rec_tracks_list)

    # Iterate over all sets of recommendations from groups of 5 SpotifyIDs
    for i, rec_set in enumerate(rec_tracks_list):
        # Iterate over every track
        for j, track_id in enumerate(rec_set):
  
            rec_tracks_list_full[i*20 + j] = track_id
            

    all_track_ids = pd.DataFrame(np.array(rec_tracks_list_full))
    all_track_ids.columns=['track_id']


    return all_track_ids
    

In [137]:
tmp = get_similar_tracks_from_playlist(user_id, playlist_dict, access_token)

'Bearer <access token redacted - cell outputs must not contain credentials>'