In [2]:
# provided list - m songs ranked from best to worst in terms of relevance
# old output list - top n songs
# new output list - make minimum number of replacements to top n in order to satisfy diversity score

In [1]:
import numpy as np
import random
import csv
import pandas as pd

In [4]:
# key parameters

dimensionality = 50
m = 40
n = 10
min_diversity_score = 0.5

In [5]:
# generate dummy data

ideal_vector = np.random.uniform(-10,10,size=dimensionality)

provided_list = []

for i in range(m):
    provided_list.append(np.random.uniform(-10,10,size=dimensionality))
    
# sort provided_list from most to least relevant

dists = []
for vector in provided_list:
    dists.append(np.linalg.norm(ideal_vector - vector))
    
provided_list = [x for _,x in sorted(zip(dists, provided_list))]
    

In [6]:
# extract set of 10000 real song vectors, name, artist & genre into 2d list

song_id_vector_dict = np.load('data/song_embeddings.npy').item()
vector_song_id_dict = dict((str(v),k) for k,v in song_id_vector_dict.items())

In [7]:
song_vector_list = list(song_id_vector_dict.values())

In [9]:
# randomly select 10000 song_ids from the dict

d = list(song_id_vector_dict.keys()) 

song_id_list = random.sample(list(d), 10000)

# create 2d array where each entry of outer array is length 4 list of song_id, title, artist, genre, embedding list

df = pd.read_csv('data/full_msd.csv') 


In [10]:
df.head()

Unnamed: 0.1,Unnamed: 0,artist_mbid,artist_name,artist_playmeid,danceability,duration,energy,key,loudness,mode,release,release_7digitalid,song_hotttnesss,song_id,tempo,time_signature,title,track_7digitalid,track_id,year
0,0,b'bc30924c-1900-41c3-9a97-41a93433836f',b'Baka Beyond',-1,0.0,368.14322,0.0,9,-11.482,0,b'The Meeting Pool',38594,0.355286,b'SOXFJZQ12A6D4F81FB',139.092,4,b'Journey (Album Version)',412977,b'TRCCCRP128F147BEFA',1995
1,1,b'4cda00d8-55d3-42cd-b5e3-5e1934406eaa',b'The Fleetwoods',3031,0.0,175.59465,0.0,5,-15.193,1,b'Come Softly To Me: The Very Best Of The Flee...,160575,0.310792,b'SOMSPXD12A8C139F36',87.65,4,b'Ten Times Blue',1694460,b'TRCCCDI128F427CC1F',1993
2,2,b'29762c82-bb92-4acd-b1fb-09cc4da250d2',b'Joe Satriani',8426,0.0,254.85016,0.0,7,-4.707,1,b'Super Colossal',308005,0.693272,b'SOSIYAD12A8C14097F',87.875,4,"b""It's So Good""",3473087,b'TRCCCMQ128F42AE752',2006
3,3,b'913096b7-10dc-41ca-9777-f299df45e9e2',b'BIZ',59280,0.0,233.40363,0.0,9,-5.972,1,b'Long Way Down Single',195821,,b'SODXCXN12A8C135FDD',97.997,4,b'Ghosts',2130366,b'TRCCCKF128F424C3D3',0
4,4,b'e4cfb284-9ab1-47f0-a725-082f84275a16',b'Memphis Slim',19214,0.0,110.62812,0.0,5,-20.1,1,b'Essential Blues Grooves Vol. 1',599348,,b'SODKHJS12AB0183BF0',200.962,1,b'Baby Please Come Home',6650421,b'TRCCCFH12903CEBC70',0


In [11]:
def prepare_song_id(song_id):
    return ('b\'' + song_id + '\'')


In [12]:
print(df.loc[(df['song_id'] == prepare_song_id(song_id_list[1]))])


        Unnamed: 0                              artist_mbid artist_name  \
766409      766409  b'633c13a3-335a-466f-8e01-535837faeff5'      b'Ayo'   

        artist_playmeid  danceability   duration  energy  key  loudness  mode  \
766409               -1           0.0  165.53751     0.0    9    -8.931     0   

                   release  release_7digitalid  song_hotttnesss  \
766409  b'Gravity At Last'              393852         0.728083   

                      song_id    tempo  time_signature                 title  \
766409  b'SOVIHIT12AB0180ECD'  127.398               4  b'Get Out Of My Way'   

        track_7digitalid               track_id  year  
766409           4369491  b'TRSXWHS128F92F732F'  2008  


In [13]:
# calculate diversity of top n vectors

def get_diversity(song_vector_list):
    
    sim = 0
    
    n = len(song_vector_list)
    
    for i in range(n):
        for j in range(n):
            sim += 1 - np.linalg.norm(song_vector_list[i]-song_vector_list[j])
            
    return sim/((n/2)*(n-1))


In [25]:
provided_list = random.sample(song_vector_list, 100)
get_diversity(provided_list)

1.925528494977808

In [19]:
song_id = vector_song_id_dict[str(song_vector_list[0])]
print(song_id)

SOAKIMP12A8C130995


In [None]:
# swap until diversity of top n vectors > min_diversity_score

ideal_vector = np.random.uniform(-10,10,size=dimensionality)


In [5]:
# test out diversity levels

recommendations = np.load('data/recommendations.npy')

print(recommendations.shape)

print(recommendations[0])

(1000, 100, 2)
[['SOBONKR12A58A7A7E0' '00000b722001882066dff9d2da8a775658053ea0']
 ['SOAUWYT12A81C206F1' '00000b722001882066dff9d2da8a775658053ea0']
 ['SOPUCYA12A8C13A694' '00000b722001882066dff9d2da8a775658053ea0']
 ['SOFLJQZ12A6D4FADA6' '00000b722001882066dff9d2da8a775658053ea0']
 ['SOAXGDH12A8C13F8A1' '00000b722001882066dff9d2da8a775658053ea0']
 ['SOHTKMO12AB01843B0' '00000b722001882066dff9d2da8a775658053ea0']
 ['SOSXLTC12AF72A7F54' '00000b722001882066dff9d2da8a775658053ea0']
 ['SOFRQTD12A81C233C0' '00000b722001882066dff9d2da8a775658053ea0']
 ['SOVDSJC12A58A7A271' '00000b722001882066dff9d2da8a775658053ea0']
 ['SORJICW12A8C13640D' '00000b722001882066dff9d2da8a775658053ea0']
 ['SOUFTBI12AB0183F65' '00000b722001882066dff9d2da8a775658053ea0']
 ['SOBOUPA12A6D4F81F1' '00000b722001882066dff9d2da8a775658053ea0']
 ['SOVWBYM12A6D4F8A22' '00000b722001882066dff9d2da8a775658053ea0']
 ['SOUNZHU12A8AE47481' '00000b722001882066dff9d2da8a775658053ea0']
 ['SOZVVRE12A8C143150' '00000b722001882066dff9d