# Music Merge 
### Playing with the Spotify API
Using Spotipy wrapper

In [1]:
import pandas as pd
import numpy as np
import os
import sys
import spotipy
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Input
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util
import numpy.random as rand
from scipy import spatial

Using TensorFlow backend.


In [None]:
'''
define the API connection - usually log in from here as well...
'''
token = util.prompt_for_user_token(' ', '')

client_credentials_manager = SpotifyClientCredentials()
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


In [7]:
def get_uri(artist):
    results = sp.search(q = "artist:" + artist, type = "artist")
    artist_id = results['artists']['items'][0]['uri'] # changed from '1' to '0' ??
   
    return(artist_id)

def related_artists(name):
    related = sp.artist_related_artists(get_uri(name))
    rels = []
    for artist in related['artists']:
        rels.append(artist['name'])
    return(rels)


def next_artist(artist, artists = []):
    ''' artists is the storage of past people that have been in there'''
    RelatedArtists = related_artists(artist)
    relates = [] # empty list to append with future 
    
    for i in RelatedArtists:
        relates.append(related_artists(i) )
    
    # create a bag of words - to index each artist in:
    
    BOW = np.unique(relates).flatten() 
    vecs = np.zeros((len(relates), len(BOW)))  # unique artists as columns...

    for i in range(len(relates)):
        # turn each into a vector of 0 and 1
        for art in relates[i]:
            # this loops over each artist:
            idx = np.where(BOW == art)[0][0]
            # set that idx number = to 1
            vecs[i,idx] = 1
    
    # then calculate cosine similarity and select the lowest similarity artist and print them
    sims = np.zeros((len(relates),len(relates)))
    sim_means = []
    for i in range(len(vecs)):
        # loop over each vector
        for j in range(len(vecs)):
            # do this again because you want to get similarity between each matrix...
            sims[i,j] = 1 - spatial.distance.cosine(vecs[i],vecs[j])
        # calculate row wise means:
        sim_means.append(np.var(sims[i] ))
    
    # find the lowest scoring similarity artist
    lowest_idx = np.where(sim_means == max(sim_means) )[0][0]      
    return(RelatedArtists[lowest_idx])

## Create function to find x closest artists from their autoencodings

In [11]:
enc = pd.read_csv('401.csv').ix[:,1:]

In [13]:
def artist_path(start_art, target_art, steps):
    
    step_size = 1/steps
    artists = []
    artists.append(start_art)
    
    # need some try - in case the artist name is not valid...
    a1_idx = np.where(enc['Artist'] == start_art)[0][0]
    a2_idx = np.where(enc['Artist'] == target_art)[0][0]
    
    # line equation form:
    # line = np.array(enc.ix[a1_idx,:3]) + t * (np.array(enc.ix[a2_idx,:3] - np.array(enc.ix[a1_idx,:3])) # for some t
    
    for i in range(1,steps+1):
        # 1 -> steps because already dealth with the first point...
        # calculate the target point on line
        point = np.array(enc.ix[a1_idx,:3]) + i*step_size * (np.array(enc.ix[a2_idx,:3] - np.array(enc.ix[a1_idx,:3])) )  
        # find closest point by calculating distances of all artists...
        distances = []
                                                             
                                                            
        for j in range(len(enc)):
            distances.append( spatial.distance.euclidean(enc.ix[j,:3], point) )
        # choose minimum distance vector:
        choice_idx = np.where(distances == np.min(distances) )[0][0]
        
        artists.append(enc['Artist'][choice_idx])
    return(artists)                                                     
                                                                                                             

In [17]:
artist_path('Drake','Calvin Harris', 3)

['Drake', 'X Ambassadors', 'Disclosure', 'Calvin Harris']

## Pull in songs for an artist:
Start with just the top tracks for an artist and an audio analysis...


In [94]:
def get_next_track(artURI, last_track):   
    trackids = [] 
    K = sp.artist_top_tracks(artURI)
    for i in range(10):
        trackids.append(K['tracks'][i]['uri'])
    
    # then do audio analysis to find the best one:
    features = np.zeros((len(trackids),13)) # for storing song features in a grid...
    for track in range(len(trackids)):
        Track = sp.audio_features(trackids[track])
        # features
        feats = ['acousticness','danceability','duration_ms','energy','instrumentalness','key','liveliness',\
                 'loudness','mode','speechiness','tempo','time_signature','valence']
        for i in range(len(feats)):
        features[track, i] = Track[track][feats[i]]   
    # next calculate similarity between all tracks...
    return(trackids)

In [103]:
sp.audio_features('spotify:track:7MXVkk9YMctZqd1Srtv4MB')

[{'acousticness': 0.168,
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/7MXVkk9YMctZqd1Srtv4MB',
  'danceability': 0.675,
  'duration_ms': 230453,
  'energy': 0.595,
  'id': '7MXVkk9YMctZqd1Srtv4MB',
  'instrumentalness': 3.36e-06,
  'key': 7,
  'liveness': 0.136,
  'loudness': -7.033,
  'mode': 1,
  'speechiness': 0.284,
  'tempo': 185.998,
  'time_signature': 4,
  'track_href': 'https://api.spotify.com/v1/tracks/7MXVkk9YMctZqd1Srtv4MB',
  'type': 'audio_features',
  'uri': 'spotify:track:7MXVkk9YMctZqd1Srtv4MB',
  'valence': 0.49}]

### Larger Dataset

In [2]:
pd.read_csv('genres.csv').ix[:,1].tolist()
artists = pd.read_csv('artists.csv',encoding = "ISO-8859-1")

In [None]:
import time

# setup null matrix length of artists by length of genres...
GEN = np.zeros((len(artists),len(genres)))

# run a loop over each artist to get their vector of 
uris = artists['URI']
start_time = time.time()

for i in range(len(uris)):
    if (i % 100) == 0 :
        print(i)
        print('Time passed so far is: ', time.time() - start_time)
        print('-----------------------------------------')
    K = sp.artist(uris[i])['genres']
    for k in range(len(K)):
        idx = np.where(np.array(genres) == K[k])[0][0]
        GEN[i,idx] = 1

In [126]:
# make a csv file of artists and their genre matrix:
GEN2 = pd.DataFrame(GEN)
vecs = pd.concat([artists,GEN2],axis=1).ix[:,3:]

# normalize then autoencode this...
vecs['popularity'] = vecs['popularity']/100
vecs['followers'] = vecs['followers'] / 10000000 

vecs.to_csv('Genre_matrix')

inps = np.array(vecs)

In [None]:
## From 20k now import as  vecs:
vecs = pd.read_csv('raw_data.csv',encoding = "ISO-8859-1")
namezz = []
genres = []
start_time = time.time()

for i in range(80):
    print('next 50...')
    
    LIST = sp.artists(URIs.tolist()[50*i:50*(i+1)] ) ['artists']
    for j in range(50):
        namezz.append(LIST[j]['name'])
        genres.append(LIST[j]['genres'])
        
    print('Time passed so far is: ', time.time() - start_time)
    print('-----------------------------------------')  
    
# create genre vectors:
flatten = lambda l: [item for sublist in l for item in sublist]
genrez = np.sort(np.unique(flatten(genres)) )

### Get genre matrix for each person...
# setup null matrix length of artists by length of genres...
GEN = np.zeros((len(namezz),len(genrez)))

for i in range(len(uris)):
    if i <4000:
        G = genres[i]        
        for k in range(len(G)):
            idx = np.where(genrez == G[k])[0][0]
            GEN[i,idx] = 1
inps = GEN # rename for ez use

In [273]:
''' make the autoencoder'''
inputs = Input(shape=(np.shape(inps)[1],))
fl_dim = 50
encoding_dim = 3
# a layer instance is callable on a tensor, and returns a tensor
ENCOD1 = Dense(fl_dim, activation='relu')(inputs)
ENCOD2 = Dense(encoding_dim, activation='relu')(ENCOD1)
DECOD1 = Dense(fl_dim, activation='relu')(ENCOD2)
DECOD2 = Dense(np.shape(inps)[1], activation='linear')(DECOD1)
#predictions = Dense(10, activation='softmax')(x)

# this creates a model that includes
# the Input layer and three Dense layers
autoencoder = Model(input=inputs, output=DECOD2)

# this model maps an input to its encoded representation
encoder = Model(inputs, output = ENCOD2)

# create a placeholder for an encoded (N-dimensional) input
encoded_input = Input(shape=(encoding_dim,))
# retrieve the last slayer of the autoencoder model
#decoder_layer = autoencoder.layers[-1]
# create the decoder model
#decoder = Model(input=encoded_input, output=decoder_layer(encoded_input))

### compile the model...
autoencoder.compile(optimizer='rmsprop', loss='mse')

In [None]:
autoencoder.fit(inps, inps,
                nb_epoch=10,batch_size=10,shuffle=True )

In [213]:
''' save the predictions... '''
encoded_inputs = pd.DataFrame(encoder.predict(inps))

''' add in names and spotify URIs to the df above... '''
encoded_inputs['Artist'] = namezz
encoded_inputs.to_csv('MORE_encoded_artists.csv')
enc = encoded_inputs
enc['URI'] = URIs[:4000]

In [270]:
def artist_path(start_art, target_art, steps):
    
    step_size = 1/steps
    artists = []
    artists.append([start_art,start_art,start_art,start_art,start_art])
    uri = []
    uri.append([enc.loc[enc['Artist']== start_art]['URI'].reset_index(drop=True)[0],\
                enc.loc[enc['Artist']== start_art]['URI'].reset_index(drop=True)[0],\
                enc.loc[enc['Artist']== start_art]['URI'].reset_index(drop=True)[0],\
                enc.loc[enc['Artist']== start_art]['URI'].reset_index(drop=True)[0],\
                enc.loc[enc['Artist']== start_art]['URI'].reset_index(drop=True)[0]])
    
    # need some try - in case the artist name is not valid...
    a1_idx = np.where(np.array(enc['Artist']) == str(start_art))[0][0]
    a2_idx = np.where(np.array(enc['Artist']) == str(target_art))[0][0]
    
    # line equation form:
    # line = np.array(enc.ix[a1_idx,:3]) + t * (np.array(enc.ix[a2_idx,:3] - np.array(enc.ix[a1_idx,:3])) # for some t
    
    for i in range(1,steps+1):
        # 1 -> steps because already dealth with the first point...
        # calculate the target point on line
        point = np.array(enc.ix[a1_idx,:3]) + i*step_size * (np.array(enc.ix[a2_idx,:3] - np.array(enc.ix[a1_idx,:3])) )  
        # find closest point by calculating distances of all artists..
        
        distances = []
                                                             
        # first we can discard artists who have a higher than absolute x% variation on all three metrics
        # this is to speed up the algorithm
        #percen = .5
        
        #enc2 = enc[(enc[0]>point[0]*(1-percen)) & (enc[0]<point[0]*(1+percen))]
        
        for j in range(len(enc)):
            distances.append( spatial.distance.euclidean(enc.ix[j,:3], point) ) # can also try euclidean dist...
        # choose minimum 3 distance vector:
        choice_idx = []
        
        ord_dist = np.sort(distances)
        first = ord_dist[0]
        second = ord_dist[1]
        third = ord_dist[2]
        fourth = ord_dist[3]
        fifth = ord_dist[4]
        
        choice_idx.append(np.where(distances == first )[0][0])
        choice_idx.append(np.where(distances == second )[0][0])
        choice_idx.append(np.where(distances == third )[0][0])
        choice_idx.append(np.where(distances == fourth )[0][0])
        choice_idx.append(np.where(distances == fifth )[0][0])
        
        print(choice_idx)
        artists.append(enc['Artist'][choice_idx].tolist())
        uri.append(enc['URI'][choice_idx].tolist())
    return(artists, uri)                                                     

In [185]:
enc['URI'][[963, 3191, 762]].tolist()

['spotify:artist:1tqZaCwM57UFKjWoYwMLrw',
 'spotify:artist:6BMhCQJYHxxKAeqYS1p5rY',
 'spotify:artist:1UdQqCUR7RwB9YYJONwbdM']

In [263]:
artist_path('Alesso','Tory Lanez', 3)

[2176, 429, 491, 491, 2485]
[3594, 3853, 3313, 2741, 3168]
[1427, 405, 535, 2786, 2308]


([['Alesso', 'Alesso', 'Alesso'],
  ['Luke James', 'Kwabs', 'Stacy Barthe', 'Stacy Barthe', 'Tank'],
  ['Beyoncé',
   "Colby O'Donis",
   'Amy Winehouse',
   'Kanye West',
   'Far East Movement'],
  ['Tory Lanez', 'Greg Brown', 'A$AP Rocky', 'Lifehouse', 'R. City']],
 [['spotify:artist:4AVFqumd2ogHFlRbKIjp1t',
   'spotify:artist:4AVFqumd2ogHFlRbKIjp1t',
   'spotify:artist:4AVFqumd2ogHFlRbKIjp1t'],
  ['spotify:artist:4E7AV8mtElSjHZP3xA9kyU',
   'spotify:artist:0r0KdmVS1Er3kaFnl1KPog',
   'spotify:artist:0yq6uHIfFks9yOURUuCITV',
   'spotify:artist:0yq6uHIfFks9yOURUuCITV',
   'spotify:artist:4mwXUEKaW4ftbncf9Hi58l'],
  ['spotify:artist:6vWDO969PvNqNYHIOW5v0m',
   'spotify:artist:7fObcBw9VM3x7ntWKCYl0z',
   'spotify:artist:6Q192DXotxtaysaqNPy5yR',
   'spotify:artist:5K4W6rqBFWDnAN6FQUkS6x',
   'spotify:artist:698hF4vcwHwPy8ltmXermq'],
  ['spotify:artist:2jku7tDXc6XoB6MO2hFuqg',
   'spotify:artist:0nnDCl6emTFoWtygqSs4Jy',
   'spotify:artist:13ubrt8QOOCPljQ2FL1Kca',
   'spotify:artist:5PokPZ

In [260]:
def get_next_track(artURI, last_track):   
    trackids = [] 
    K = sp.artist_top_tracks(artURI)['tracks']
    for i in range(len(K)):
        trackids.append(K[i]['uri'])
    
    # then do audio analysis to find the best one:
    features = np.zeros((len(trackids),13)) # for storing song features in a grid...
    Track = sp.audio_features(trackids)
    for track in range(len(trackids)):
        feats = ['acousticness','danceability','duration_ms','energy','instrumentalness','key','liveliness',\
                 'loudness','mode','speechiness','tempo','time_signature','valence']
        for i in range(len(feats)):
        features[track, i] = Track[track][feats[i]] 
        
    # next calculate similarity between all tracks...
    ''' for each track calculate similarity between last track '''
    ################################
    # get last track features:
    Track = sp.audio_features(last_track)
    last_trk_feat = []
    last_trk_feat.append(Track[0]['acousticness'])
    last_trk_feat.append(Track[0]['danceability'])
    last_trk_feat.append(Track[0]['duration_ms'])
    last_trk_feat.append(Track[0]['energy'])
    last_trk_feat.append(Track[0]['instrumentalness'])
    last_trk_feat.append(Track[0]['key'])
    last_trk_feat.append(Track[0]['liveness'])
    last_trk_feat.append(Track[0]['loudness'])
    last_trk_feat.append(Track[0]['mode'])
    last_trk_feat.append(Track[0]['speechiness'])
    last_trk_feat.append(Track[0]['tempo'])
    last_trk_feat.append(Track[0]['time_signature'])
    last_trk_feat.append(Track[0]['valence'])
     ##########################3
        
    similarity = []
    for i in range(len(features)):
        # cosine sim between last track vector and each track features...
        similarity.append(spatial.distance.euclidean( features[i] , last_trk_feat  ) ) # try euclidean distance
        
    if np.min(similarity) == 0:
        # find maximum similarity track ID
        print('second =: ', sorted(similarity)[1])
        second = sorted(similarity)[1] #[len(similarity)-2] # second SMALLEST number
    
        # then we have the same song...
        idx = np.where(similarity == second)[0][0]
        print('index = ',idx)
    
    else:
        idx = np.where(similarity == np.min(similarity) )[0][0]
        print('index =', idx)
    
    #then use the index to get which track:
    track_name = sp.track(trackids[idx])['name']
    
    return(track_name,trackids[idx], similarity[idx] )

In [247]:
def get_first_track(art1, art2):   
    trackids1 = [] 
    trackids2 = [] 
    K = sp.artist_top_tracks(art1)
    for i in range(5):
        trackids1.append(K['tracks'][i]['uri'])
        
    J = sp.artist_top_tracks(art2)
    for i in range(5):
        trackids2.append(J['tracks'][i]['uri'])
    
    # then do audio analysis to find the best one:
    features1 = np.zeros((len(trackids1),13)) # for storing song features in a grid...
    Track1 = sp.audio_features(trackids1)
    for track in range(len(trackids1)):
        feats = ['acousticness','danceability','duration_ms','energy','instrumentalness','key','liveliness',\
                 'loudness','mode','speechiness','tempo','time_signature','valence']
        for i in range(len(feats)):
        features[track, i] = Track[track][feats[i]] 

    features2 = np.zeros((len(trackids2),13)) # for storing song features in a grid...
    Track2 = sp.audio_features(trackids2)
    for track in range(len(trackids2)):
        feats = ['acousticness','danceability','duration_ms','energy','instrumentalness','key','liveliness',\
                 'loudness','mode','speechiness','tempo','time_signature','valence']
        for i in range(len(feats)):
        features2[track, i] = Track[track][feats[i]] 
        
    similarity = []
    for i in range(len(features)):
        # cosine sim between last track vector and each track features...
        
        similarity.append(1 - spatial.distance.cosine( features[i] , last_trk_feat  ) )
    
    # find maximum similarity track ID
    second = sorted(similarity)[len(similarity)-2] # second largest nubmer
    
    if np.max(similarity) > 0.999999999:
        # then we have the same song...
        idx = np.where(similarity == second)[0][0]
    
    else:
        idx = np.where(similarity == np.max(similarity) )[0][0]
    
    #then use the index to get which track:
    track_name = sp.track(trackids[idx])['name']
    
    return(track_name,trackids[idx], similarity[idx] )

In [261]:
# start with an artist and a target artist - get their path and the steps... then iteratively get the next track...
# get a random first track though of artist 1

def music_merge(a1, a2, steps):
    
    # get artist path:
    namez , uriz = artist_path(a1,a2, steps)
    
    tracks = []
    track_names = []
    
    # for the first artist randomly choose first track:
    rand_ix = rand.randint(0,3) # from top 10 tracks
    
    # need to find the top x of their traks (some less than 5)
    
    first_track = sp.artist_top_tracks(uriz[0][0])['tracks'][rand_ix]['uri']
    first_track_name = sp.artist_top_tracks(uriz[0][0])['tracks'][rand_ix]['name']
    tracks.append(first_track)
    track_names.append(first_track_name)
    
    art_choice = 0 # initialize it
    
    for i in range(1,steps+1):
        print(i)
        # get next track...
        
        # 5 candidates artists per track:
        
        cand_name = []
        cand_ID = []
        cand_sim = []
        
        if( i != steps):
            for a in range(len(namez[0])):
                next_track_name, next_trackID, next_track_sim = get_next_track( uriz[i][a],tracks[i-1])
                cand_name.append(next_track_name)
                cand_ID.append(next_trackID)
                cand_sim.append(next_track_sim)
            print(cand_sim)    
            best_track_idx = np.where(cand_sim == np.min(cand_sim))[0]
            if len(best_track_idx)>1:
                print('best track is tied -- choosing the first track')
                best_track_idx = best_track_idx[0]
            print(best_track_idx)
        # pick which artist:
            art_choice = best_track_idx
        
            tracks.append(cand_ID[best_track_idx])
            track_names.append(cand_name[best_track_idx])
        else:
            # last iteration:
            next_track_name, next_trackID, next_track_sim = get_next_track( uriz[i][0],tracks[i-1])
            tracks.append(next_trackID)
            track_names.append(next_track_name)
            
    return(tracks, track_names,namez)
    

In [272]:
music_merge('Joe Pass','Johnny Cash',6)

[985, 1512, 1512, 138, 1732]
[1379, 3662, 2375, 888, 259]
[1736, 3941, 1586, 2941, 2293]
[1125, 1108, 3179, 3731, 858]
[2979, 1108, 3493, 3579, 3579]
[3493, 3434, 3579, 3579, 2979]
1
index = 1
index = 6
index = 6
index = 5
index = 2
[10653.010097454642, 17266.007677270423, 17266.007677270423, 8266.126384294823, 5199.0133309283165]
[4]
2




index = 7
index = 0
index = 6
index = 0
index = 8
[2904.2593898678024, 11906.056428148822, 3187.0057284594795, 819.0110346785652, 3613.388999909364]
[3]
3
index = 7
index = 4
index = 8
index = 6
index = 4
[18248.000729148913, 2328.3290904928494, 4941.0213942407745, 8355.001343239488, 1101.7357881276523]
[4]
4
index = 2
index = 8
index = 3
index = 9
index = 7
[6464.065334091579, 18438.031081416513, 2092.0220741148623, 11799.097962678514, 3345.1741120640736]
[2]
5
index = 5
index = 8
index = 8
index = 4
index = 4
[4067.4241784840433, 16346.04529308138, 10573.1902046574, 3627.5655958431757, 3627.5655958431757]
best track is tied -- choosing the first track
3
6
index = 8


(['spotify:track:3t7aNy8nHeWkaY4SAncR7n',
  'spotify:track:7cKQE2K6ggLByzlIMtCxzu',
  'spotify:track:1IWv7yYMFplrKaZDUldFJn',
  'spotify:track:2f9h72MZuU0ZdEnGIfGf5H',
  'spotify:track:2J3n32GeLmMjwuAzyhcSNe',
  'spotify:track:4wNFFRLo7EnEdbf2eoFLEU',
  'spotify:track:4yEvfGgJ9tYfyfXXMLza1V'],
 ['A Time for Us',
  'Bottled Up Tight',
  "Wasn't Expecting That",
  'Here We Go',
  'Say It, Just Say It',
  'Reasons To Love You',
  'She Used to Love Me a Lot'],
 [['Joe Pass', 'Joe Pass', 'Joe Pass', 'Joe Pass', 'Joe Pass'],
  ['Wisin & Yandel',
   'Lucky Luciano',
   'Lucky Luciano',
   'Starsailor',
   'Luke Sital-Singh'],
  ['Staysman & Lazz', 'Tom Zanetti', 'Foy Vance', 'Jamie Lawson', 'Roo Panes'],
  ['Joe Purdy',
   'Wrabel',
   'Noah Gundersen',
   'Pete Yorn',
   'Drew Holcomb & The Neighbors'],
  ['Corinne Bailey Rae',
   'Lane 8',
   "The Mowgli's",
   'Benjamin Francis Leftwich',
   'Matt Simons'],
  ['Priscilla Ahn', 'Lane 8', 'Johnny Cash', 'Meiko', 'Meiko'],
  ['Johnny Cash', '