In [12]:
import ast
import re
from os.path import exists

import numpy as np
import pandas as pd
from numba import jit
from tqdm import tqdm

# Data

In [2]:
tf_idf = pd.read_table("./../MMSR_WT22_Task1_Data/id_lyrics_tf-idf_mmsr.tsv", index_col="id")
tf_idf.head()

Unnamed: 0_level_0,abl,accept,across,act,addict,afraid,age,ago,ah,ahead,...,yea,yeah,year,yellow,yes,yesterday,yet,yo,young,youth
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9jbSytob9XRzwvB6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150511
Njp6JPM8vitbhVJU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
h48f46ZsT9h0Z5Dm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.149783,0.0,0.0,0.0,0.0
ZmXVK43zlqdeq6z8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PV5EXN6AIVBqvsLO,0.0,0.0,0.0,0.0,0.0,0.327025,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Other Feature vectors to use
word2vec = pd.read_table("./../MMSR_WT22_Task1_Data/id_lyrics_word2vec_mmsr.tsv", index_col='id')
bert = pd.read_table("./../MMSR_WT22_Task1_Data/id_bert_mmsr.tsv", index_col='id')

In [4]:
genres = pd.read_table("./../MMSR_WT22_Task1_Data/id_genres_mmsr.tsv", index_col="id")
genres.head()

Unnamed: 0_level_0,genre
id,Unnamed: 1_level_1
0009fFIM1eYThaPg,['pop']
0010xmHR6UICBOYT,"['beats', 'underground hip hop', 'lo fi']"
002Jyd0vN4HyCpqL,"['hard rock', 'classic rock', 'rock', 'progres..."
006TYKNjNxWjfKjy,"['power metal', 'symphonic metal', 'symphonic ..."
007LIJOPQ4Sb98qV,"['post punk', 'new wave', 'dream pop', 'altern..."


In [5]:
info = pd.read_table("./../MMSR_WT22_Task1_Data/id_information_mmsr.tsv", index_col="id")
info.head()

Unnamed: 0_level_0,artist,song,album_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0009fFIM1eYThaPg,Cheryl,Rain on Me,3 Words
0010xmHR6UICBOYT,Oddisee,After Thoughts,The Beauty in All
002Jyd0vN4HyCpqL,Blue Öyster Cult,ME 262,Secret Treaties
006TYKNjNxWjfKjy,Rhapsody,Flames of Revenge,Legendary Years (Re-Recorded)
007LIJOPQ4Sb98qV,The Chameleons,Nostalgia,What Does Anything Mean? Basically (2009 Remas...


In [6]:
# Check if there is any null value in tf_idf values
tf_idf.isnull().sum().sum()

0

# Similarity functions

In [15]:
@jit(nopython=True)
def cosine_similarity(d1, d2):
    """Cosine of the angle between two vectors.

    Computes ``(d1 . d2) / (||d1|| * ||d2||)``. When the product of the two
    norms is zero (at least one vector is all zeros) the similarity is
    defined as 0 instead of dividing by zero.
    """
    norm_product = np.linalg.norm(d1) * np.linalg.norm(d2)
    if norm_product != 0:
        return (d1 @ d2) / norm_product
    # Degenerate case: a zero vector has no direction
    return 0

In [16]:
@jit(nopython=True)
def inner_product(d1, d2):
    """Return the (un-normalised) dot product of the two vectors."""
    product = d1 @ d2
    return product

In [17]:
# Jaccard (Tanimoto) formulation for real-valued vectors:
#     (d1 . d2) / (||d1||^2 + ||d2||^2 - d1 . d2)
# The norms must be squared; with plain norms the value is only correct for
# unit-length vectors, where ||d|| == ||d||^2 == 1.
@jit(nopython=True)
def jaccard_formulation(d1, d2):
    """Jaccard/Tanimoto similarity between two real-valued vectors.

    Returns ``(d1 . d2) / (||d1||^2 + ||d2||^2 - d1 . d2)``, or 0.0 when the
    denominator is zero (which only happens when both vectors are all zeros).

    NOTE(review): values cached in the ``jaccard_distances_*.csv`` files were
    computed with the un-squared norms; they stay valid only for feature sets
    whose vectors are unit length and should otherwise be recomputed.
    """
    dot = d1 @ d2
    # d @ d is the squared Euclidean norm of d
    divisor = (d1 @ d1) + (d2 @ d2) - dot
    if divisor == 0:
        return 0.0
    return dot / divisor

In [18]:
# Sanity check: compare the first and third tf-idf rows with both measures
a, b = (tf_idf.iloc[row].values for row in (0, 2))
print(*(measure(np.array(a), np.array(b)) for measure in (cosine_similarity, jaccard_formulation)))

0.08598512652034522 0.04492395942776836


# Storage for computed data

In [19]:
# Matrix to store the values
# cosine_sim = np.zeros((len(tf_idf.index),len(tf_idf.index)))

# cosine_sim = np.zeros((len(tf_idf.index),len(tf_idf.index)))
# print(cosine_sim.shape) # (76115, 76115) 

# To calculate the similarity between all songs
# for row, id1 in enumerate(tf_idf.index):
#     for col,id2 in enumerate(tf_idf.index):
#         # print(row,col)
#         cosine_sim[row,col] = cosine_similarity(tf_idf.loc[id1].values, tf_idf.loc[id2].values)

In [20]:
# One similarity table per (measure, feature set). Every table is indexed by
# the song ids of the feature set it belongs to and gets one column per
# queried song. A table is read back from its CSV cache when that file exists,
# otherwise it starts out empty.

def _load_or_init(csv_name, ids):
    """Return the table cached in `csv_name`, or an empty frame indexed by `ids`."""
    if not exists(csv_name):
        return pd.DataFrame(index=ids)
    return pd.read_csv(csv_name, index_col="id")


# TFIDF
index_values = tf_idf.index
df_cosineDistance_tfidf = _load_or_init('cosine_distances_tfidf.csv', index_values)
df_innerProductDistance_tfidf = _load_or_init('innerProduct_distances_tfidf.csv', index_values)
df_jaccardDistance_tfidf = _load_or_init('jaccard_distances_tfidf.csv', index_values)

# WORD2VEC
index_values = word2vec.index
df_cosineDistance_word2vec = _load_or_init('cosine_distances_word2vec.csv', index_values)
df_innerProductDistance_word2vec = _load_or_init('innerProduct_distances_word2vec.csv', index_values)
df_jaccardDistance_word2vec = _load_or_init('jaccard_distances_word2vec.csv', index_values)

# BERT
index_values = bert.index
df_cosineDistance_bert = _load_or_init('cosine_distances_bert.csv', index_values)
df_innerProductDistance_bert = _load_or_init('innerProduct_distances_bert.csv', index_values)
df_jaccardDistance_bert = _load_or_init('jaccard_distances_bert.csv', index_values)

In [21]:
def saveDataToFile():
    """Write every similarity table to its CSV cache file in the working directory."""
    tables = (
        # TFIDF
        (df_cosineDistance_tfidf, 'cosine_distances_tfidf.csv'),
        (df_innerProductDistance_tfidf, 'innerProduct_distances_tfidf.csv'),
        (df_jaccardDistance_tfidf, 'jaccard_distances_tfidf.csv'),
        # WORD2VEC
        (df_cosineDistance_word2vec, 'cosine_distances_word2vec.csv'),
        (df_innerProductDistance_word2vec, 'innerProduct_distances_word2vec.csv'),
        (df_jaccardDistance_word2vec, 'jaccard_distances_word2vec.csv'),
        # BERT
        (df_cosineDistance_bert, 'cosine_distances_bert.csv'),
        (df_innerProductDistance_bert, 'innerProduct_distances_bert.csv'),
        (df_jaccardDistance_bert, 'jaccard_distances_bert.csv'),
    )
    for table, csv_name in tables:
        table.to_csv(csv_name, sep=',')

# Query for song ID

In [22]:
def getSongIdByQuery(query):
    """Return the id of the song described by an ``"artist,song"`` query.

    Every comma in the query is tried as the artist/song separator, so names
    that themselves contain commas can be found. For each split the exact
    text is matched first, then the text with surrounding whitespace removed
    (so ``"The Chameleons, Nostalgia"`` works as well).

    Parameters:
        query: string of the form ``"artist,song"``.

    Returns:
        The index label (song id) of the first matching row of `info`.

    Raises:
        ValueError: the query contains no comma.
        IndexError: no row of `info` matches the query.
    """
    if ',' not in query:
        raise ValueError(f'Query must look like "artist,song", got {query!r}')

    pieces = query.split(',')
    for cut in range(1, len(pieces)):
        artist = ','.join(pieces[:cut])
        track = ','.join(pieces[cut:])
        # Exact text first (the original behaviour), then whitespace-trimmed
        for name, title in ((artist, track), (artist.strip(), track.strip())):
            matches = info[(info['artist'] == name) & (info['song'] == title)].index.values
            if len(matches) > 0:
                return matches[0]

    raise IndexError(f'No song found for query {query!r}')

Let's test whether the query works correctly:

In [23]:
# User query "The Chameleons, Nostalgia"
info[(info['artist'] == 'The Chameleons') & (info['song'] == 'Nostalgia')]

Unnamed: 0_level_0,artist,song,album_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
007LIJOPQ4Sb98qV,The Chameleons,Nostalgia,What Does Anything Mean? Basically (2009 Remas...


In [24]:
# artist,song
id_song = getSongIdByQuery("The Chameleons,Nostalgia")
print(id_song)

007LIJOPQ4Sb98qV


# Distance computation

In [25]:
# def distanceToSongs(idSong, similarity_function, df, features_vector):
#     if idSong in df.columns.values:
#         print("Already in data")
#     else:
#         songs = features_vector.index.values
#         distances = [similarity_function(features_vector.loc[idSong], features_vector.loc[song]) for index,song in enumerate(songs)]
#         df[idSong]  = distances 

In [31]:
# Compiled with numba: scoring every row in a jitted loop is much faster
# than calling the similarity function row by row from Python.
@jit(nopython=True)
def distance(v:np.array, df:np.array, songs:list, similarity_function):
    # `songs` is only used for its length: one score per entry, taken from
    # the row of `df` at the same position.
    return [similarity_function(v, df[row]) for row in range(len(songs))]

def distanceToSongs(idSong, similarity_function, df, features_vector):
    """Add a column `idSong` to `df` holding its similarity to every song.

    The similarity between the feature row of `idSong` and each row of
    `features_vector` is computed with `similarity_function` and stored, in
    row order, as a new column of `df`. If `df` already has that column the
    function only prints a notice and leaves `df` untouched.
    """
    if idSong in df.columns.values:
        print("Already in data")
        return

    song_ids = np.array(features_vector.index.tolist())
    query_vector = np.array(features_vector.loc[idSong])
    feature_matrix = np.array(features_vector)
    # Values are assigned positionally: df rows are expected to be in the
    # same order as the rows of features_vector.
    df[idSong] = distance(query_vector, feature_matrix, song_ids, similarity_function)

In [32]:
distanceToSongs(id_song, cosine_similarity, df_cosineDistance_tfidf, tf_idf)

Already in data


In [33]:
df_cosineDistance_tfidf.head()

Unnamed: 0_level_0,007LIJOPQ4Sb98qV,0009fFIM1eYThaPg,wdAhzJrYFsHfCyCl,9jbSytob9XRzwvB6,Njp6JPM8vitbhVJU,h48f46ZsT9h0Z5Dm,ZmXVK43zlqdeq6z8
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9jbSytob9XRzwvB6,0.048452,0.006539,0.051474,1.0,0.097104,0.085985,0.128681
Njp6JPM8vitbhVJU,0.001441,0.013563,0.009384,0.097104,1.0,0.024038,0.010407
h48f46ZsT9h0Z5Dm,0.102718,0.015595,0.040152,0.085985,0.024038,1.0,0.081802
ZmXVK43zlqdeq6z8,0.098409,0.015704,0.028695,0.128681,0.010407,0.081802,1.0
PV5EXN6AIVBqvsLO,0.013196,0.017603,0.015591,0.078034,0.01583,0.048689,0.124416


We can compute more distances if necessary:

In [34]:
# Cosine similarity on tf-idf for two more query songs
for song_query in ("Cheryl,Rain on Me", "Doda,Riotka"):
    distanceToSongs(getSongIdByQuery(song_query), cosine_similarity, df_cosineDistance_tfidf, tf_idf)

Already in data
Already in data


In [35]:
# Distances with innerProduct
for song_query in ("The Chameleons,Nostalgia", "Cheryl,Rain on Me", "Doda,Riotka"):
    distanceToSongs(getSongIdByQuery(song_query), inner_product, df_innerProductDistance_tfidf, tf_idf)

Already in data
Already in data
Already in data


In [36]:
# Distances with jaccard similarity
for song_query in ("The Chameleons,Nostalgia", "Cheryl,Rain on Me", "Doda,Riotka"):
    distanceToSongs(getSongIdByQuery(song_query), jaccard_formulation, df_jaccardDistance_tfidf, tf_idf)

Already in data
Already in data
Already in data


And more with different feature vectors:

In [37]:
# Cosine similarity on the word2vec features
for song_query in ("The Chameleons,Nostalgia", "Cheryl,Rain on Me", "Doda,Riotka"):
    distanceToSongs(getSongIdByQuery(song_query), cosine_similarity, df_cosineDistance_word2vec, word2vec)

Already in data
Already in data
Already in data


In [38]:
# Cosine similarity on the BERT features
for song_query in ("The Chameleons,Nostalgia", "Cheryl,Rain on Me", "Doda,Riotka"):
    distanceToSongs(getSongIdByQuery(song_query), cosine_similarity, df_cosineDistance_bert, bert)

Already in data
Already in data
Already in data


## Inner product and cosine similarity give the same values here because the tf-idf vectors are normalized

In [39]:
df_cosineDistance_tfidf

Unnamed: 0_level_0,007LIJOPQ4Sb98qV,0009fFIM1eYThaPg,wdAhzJrYFsHfCyCl,9jbSytob9XRzwvB6,Njp6JPM8vitbhVJU,h48f46ZsT9h0Z5Dm,ZmXVK43zlqdeq6z8
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9jbSytob9XRzwvB6,0.048452,0.006539,0.051474,1.000000,0.097104,0.085985,0.128681
Njp6JPM8vitbhVJU,0.001441,0.013563,0.009384,0.097104,1.000000,0.024038,0.010407
h48f46ZsT9h0Z5Dm,0.102718,0.015595,0.040152,0.085985,0.024038,1.000000,0.081802
ZmXVK43zlqdeq6z8,0.098409,0.015704,0.028695,0.128681,0.010407,0.081802,1.000000
PV5EXN6AIVBqvsLO,0.013196,0.017603,0.015591,0.078034,0.015830,0.048689,0.124416
...,...,...,...,...,...,...,...
R4VMVAxVOAEWBjgg,0.021491,0.214575,0.012384,0.205137,0.017796,0.013691,0.058160
YgII1tHAaAnh14Kf,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
76qenAf8gYlH4pfq,0.022365,0.039680,0.006073,0.063405,0.011773,0.056524,0.057915
9F8jQjeibAuZinEP,0.027799,0.020988,0.036169,0.108322,0.053566,0.079405,0.134340


In [40]:
df_innerProductDistance_tfidf

Unnamed: 0_level_0,007LIJOPQ4Sb98qV,0009fFIM1eYThaPg,wdAhzJrYFsHfCyCl
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9jbSytob9XRzwvB6,0.048452,0.006539,0.051474
Njp6JPM8vitbhVJU,0.001441,0.013563,0.009384
h48f46ZsT9h0Z5Dm,0.102718,0.015595,0.040152
ZmXVK43zlqdeq6z8,0.098409,0.015704,0.028695
PV5EXN6AIVBqvsLO,0.013196,0.017603,0.015591
...,...,...,...
R4VMVAxVOAEWBjgg,0.021491,0.214575,0.012384
YgII1tHAaAnh14Kf,0.000000,0.000000,0.000000
76qenAf8gYlH4pfq,0.022365,0.039680,0.006073
9F8jQjeibAuZinEP,0.027799,0.020988,0.036169


# Get Top Values

In [41]:
# Lift the pandas display limits so that full cell contents and every column are shown
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

In [42]:
def getTopValues(idSong, df_metricUsed):
    """List all songs ordered from most to least similar to `idSong`.

    The column `idSong` of `df_metricUsed` is sorted in descending order and
    the resulting id order is used to select rows of `genres`, which are then
    left-joined with `info` on the song id.
    """
    ranked_ids = df_metricUsed[idSong].sort_values(ascending=False).index
    ranked_genres = genres.loc[ranked_ids]
    return ranked_genres.join(info, on="id", how="left")

We can get now our top recommendations to a song:

In [43]:
print(info.loc[id_song])
getTopValues(id_song, df_cosineDistance_tfidf).head(11)

artist                                            The Chameleons
song                                                   Nostalgia
album_name    What Does Anything Mean? Basically (2009 Remaster)
Name: 007LIJOPQ4Sb98qV, dtype: object


Unnamed: 0_level_0,genre,artist,song,album_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
007LIJOPQ4Sb98qV,"['post punk', 'new wave', 'dream pop', 'alternative rock', 'madchester', 'rock']",The Chameleons,Nostalgia,What Does Anything Mean? Basically (2009 Remaster)
fqotTYeTLpnRj3lF,"['pop', 'teen pop', 'rock', 'pop rock', 'disney', 'hip hop', 'grunge', 'alternative rock', 'rap', 'power pop', 'singer songwriter', 'synthpop']",Hilary Duff,So Yesterday,Metamorphosis
E7t3Y9cW86HLOFZp,"['electronica', 'electro', 'disco', 'funk', 'house', 'indietronica', 'dance rock', 'rock', 'electropop', 'indie rock', 'indie pop', 'electro house', 'space rock', 'pop', 'techno', 'beats', 'disco house', 'indie electronica', 'dance punk', 'background music']",Midnight Juggernauts,Shadows,Shadows
wdAhzJrYFsHfCyCl,"['electropop', 'pop', 'synthpop']",Doda,Riotka,Riotka
wbUm7Kdu4u7CSG39,"['disco', 'singer songwriter', 'classic soul']",Grace Jones,Tomorrow,Portfolio
8crboBOJbzovC9SC,['psychedelic rock'],Ulver,Where Is Yesterday,Childhood's End
gcFfKAX8vSRWOtqk,"['psychedelic rock', 'experimental', 'rock', 'art rock', 'classic rock', 'baroque', 'choral', 'baroque pop', 'psychedelic pop', 'gregorian chant']",The United States of America,Where Is Yesterday,The United States Of America
ExbzgBOxVe8jTGKx,"['pop', 'singer songwriter', 'rumba']",Colbie Caillat,Like Yesterday,All Of You
qSEy5OiW84jPqQb5,"['classic rock', 'rock', 'pop rock', 'pop', 'soft rock', 'hard rock', 'progressive rock', 'new wave', 'easy listening', 'metal', 'alternative rock', 'singer songwriter', 'album rock', 'glam metal']",Foreigner,That Was Yesterday,Agent Provocateur
WWvFN9dBhrf255ar,"['rock', 'alternative rock', 'polish rock', 'punk', 'blues rock', 'ska']",happysad,Taką wodą być,Mów mi dobrze


# Compare Results
We can compare now the top results and check the differences.

## 1. Different similarity functions on tf-idf vectors

In [44]:
# get a song ID from a query
id_song = getSongIdByQuery("The Chameleons,Nostalgia")

In [45]:
# show result on cosine similarity
getTopValues(id_song, df_cosineDistance_tfidf).head(11)

Unnamed: 0_level_0,genre,artist,song,album_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
007LIJOPQ4Sb98qV,"['post punk', 'new wave', 'dream pop', 'alternative rock', 'madchester', 'rock']",The Chameleons,Nostalgia,What Does Anything Mean? Basically (2009 Remaster)
fqotTYeTLpnRj3lF,"['pop', 'teen pop', 'rock', 'pop rock', 'disney', 'hip hop', 'grunge', 'alternative rock', 'rap', 'power pop', 'singer songwriter', 'synthpop']",Hilary Duff,So Yesterday,Metamorphosis
E7t3Y9cW86HLOFZp,"['electronica', 'electro', 'disco', 'funk', 'house', 'indietronica', 'dance rock', 'rock', 'electropop', 'indie rock', 'indie pop', 'electro house', 'space rock', 'pop', 'techno', 'beats', 'disco house', 'indie electronica', 'dance punk', 'background music']",Midnight Juggernauts,Shadows,Shadows
wdAhzJrYFsHfCyCl,"['electropop', 'pop', 'synthpop']",Doda,Riotka,Riotka
wbUm7Kdu4u7CSG39,"['disco', 'singer songwriter', 'classic soul']",Grace Jones,Tomorrow,Portfolio
8crboBOJbzovC9SC,['psychedelic rock'],Ulver,Where Is Yesterday,Childhood's End
gcFfKAX8vSRWOtqk,"['psychedelic rock', 'experimental', 'rock', 'art rock', 'classic rock', 'baroque', 'choral', 'baroque pop', 'psychedelic pop', 'gregorian chant']",The United States of America,Where Is Yesterday,The United States Of America
ExbzgBOxVe8jTGKx,"['pop', 'singer songwriter', 'rumba']",Colbie Caillat,Like Yesterday,All Of You
qSEy5OiW84jPqQb5,"['classic rock', 'rock', 'pop rock', 'pop', 'soft rock', 'hard rock', 'progressive rock', 'new wave', 'easy listening', 'metal', 'alternative rock', 'singer songwriter', 'album rock', 'glam metal']",Foreigner,That Was Yesterday,Agent Provocateur
WWvFN9dBhrf255ar,"['rock', 'alternative rock', 'polish rock', 'punk', 'blues rock', 'ska']",happysad,Taką wodą być,Mów mi dobrze


In [46]:
# show result on jacard distance
getTopValues(id_song, df_jaccardDistance_tfidf).head(11)

Unnamed: 0_level_0,genre,artist,song,album_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
007LIJOPQ4Sb98qV,"['post punk', 'new wave', 'dream pop', 'alternative rock', 'madchester', 'rock']",The Chameleons,Nostalgia,What Does Anything Mean? Basically (2009 Remaster)
fqotTYeTLpnRj3lF,"['pop', 'teen pop', 'rock', 'pop rock', 'disney', 'hip hop', 'grunge', 'alternative rock', 'rap', 'power pop', 'singer songwriter', 'synthpop']",Hilary Duff,So Yesterday,Metamorphosis
E7t3Y9cW86HLOFZp,"['electronica', 'electro', 'disco', 'funk', 'house', 'indietronica', 'dance rock', 'rock', 'electropop', 'indie rock', 'indie pop', 'electro house', 'space rock', 'pop', 'techno', 'beats', 'disco house', 'indie electronica', 'dance punk', 'background music']",Midnight Juggernauts,Shadows,Shadows
wdAhzJrYFsHfCyCl,"['electropop', 'pop', 'synthpop']",Doda,Riotka,Riotka
wbUm7Kdu4u7CSG39,"['disco', 'singer songwriter', 'classic soul']",Grace Jones,Tomorrow,Portfolio
8crboBOJbzovC9SC,['psychedelic rock'],Ulver,Where Is Yesterday,Childhood's End
gcFfKAX8vSRWOtqk,"['psychedelic rock', 'experimental', 'rock', 'art rock', 'classic rock', 'baroque', 'choral', 'baroque pop', 'psychedelic pop', 'gregorian chant']",The United States of America,Where Is Yesterday,The United States Of America
ExbzgBOxVe8jTGKx,"['pop', 'singer songwriter', 'rumba']",Colbie Caillat,Like Yesterday,All Of You
qSEy5OiW84jPqQb5,"['classic rock', 'rock', 'pop rock', 'pop', 'soft rock', 'hard rock', 'progressive rock', 'new wave', 'easy listening', 'metal', 'alternative rock', 'singer songwriter', 'album rock', 'glam metal']",Foreigner,That Was Yesterday,Agent Provocateur
WWvFN9dBhrf255ar,"['rock', 'alternative rock', 'polish rock', 'punk', 'blues rock', 'ska']",happysad,Taką wodą być,Mów mi dobrze


Between cosine similarity and jaccard distance there is no difference on this selected song in the top 10.

## 2. Different feature vectors on cosine similarity

In [47]:
# tf-idf: the query song followed by its 10 nearest neighbours
tfidf_cos = getTopValues(id_song, df_cosineDistance_tfidf).iloc[:11]
# rows 1..10 are the recommendations (row 0 is the query song itself)
tfidf_cos_list = tfidf_cos.iloc[1:11][['artist', 'song']]
tfidf_cos

Unnamed: 0_level_0,genre,artist,song,album_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
007LIJOPQ4Sb98qV,"['post punk', 'new wave', 'dream pop', 'alternative rock', 'madchester', 'rock']",The Chameleons,Nostalgia,What Does Anything Mean? Basically (2009 Remaster)
fqotTYeTLpnRj3lF,"['pop', 'teen pop', 'rock', 'pop rock', 'disney', 'hip hop', 'grunge', 'alternative rock', 'rap', 'power pop', 'singer songwriter', 'synthpop']",Hilary Duff,So Yesterday,Metamorphosis
E7t3Y9cW86HLOFZp,"['electronica', 'electro', 'disco', 'funk', 'house', 'indietronica', 'dance rock', 'rock', 'electropop', 'indie rock', 'indie pop', 'electro house', 'space rock', 'pop', 'techno', 'beats', 'disco house', 'indie electronica', 'dance punk', 'background music']",Midnight Juggernauts,Shadows,Shadows
wdAhzJrYFsHfCyCl,"['electropop', 'pop', 'synthpop']",Doda,Riotka,Riotka
wbUm7Kdu4u7CSG39,"['disco', 'singer songwriter', 'classic soul']",Grace Jones,Tomorrow,Portfolio
8crboBOJbzovC9SC,['psychedelic rock'],Ulver,Where Is Yesterday,Childhood's End
gcFfKAX8vSRWOtqk,"['psychedelic rock', 'experimental', 'rock', 'art rock', 'classic rock', 'baroque', 'choral', 'baroque pop', 'psychedelic pop', 'gregorian chant']",The United States of America,Where Is Yesterday,The United States Of America
ExbzgBOxVe8jTGKx,"['pop', 'singer songwriter', 'rumba']",Colbie Caillat,Like Yesterday,All Of You
qSEy5OiW84jPqQb5,"['classic rock', 'rock', 'pop rock', 'pop', 'soft rock', 'hard rock', 'progressive rock', 'new wave', 'easy listening', 'metal', 'alternative rock', 'singer songwriter', 'album rock', 'glam metal']",Foreigner,That Was Yesterday,Agent Provocateur
WWvFN9dBhrf255ar,"['rock', 'alternative rock', 'polish rock', 'punk', 'blues rock', 'ska']",happysad,Taką wodą być,Mów mi dobrze


In [48]:
# bert: the query song followed by its 10 nearest neighbours
bert_cos = getTopValues(id_song, df_cosineDistance_bert).iloc[:11]
# rows 1..10 are the recommendations (row 0 is the query song itself)
bert_cos_list = bert_cos.iloc[1:11][['artist', 'song']]
bert_cos

Unnamed: 0_level_0,genre,artist,song,album_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
007LIJOPQ4Sb98qV,"['post punk', 'new wave', 'dream pop', 'alternative rock', 'madchester', 'rock']",The Chameleons,Nostalgia,What Does Anything Mean? Basically (2009 Remaster)
tqc0WIiaCUyFpbU0,"['punk', 'rock', 'new wave', 'classic rock', 'alternative rock', 'power pop', 'post punk', 'pop punk', 'madchester']",Buzzcocks,Nostalgia,Operators Manual (Buzzcocks Best)
dqBAZj6zw8CsY4Cf,"['rock', 'christian rock', 'hard rock', 'alternative rock', 'christian hard rock']",Skillet,One Day Too Late,Awake
m71LtJ9HiCiE4ewm,"['rock', 'punk', 'alternative rock', 'pop punk', 'skate punk', 'hard rock', 'grunge', 'hardcore', 'folk', 'pop rock', 'ska', 'alternative metal']",The Offspring,Can't Repeat,Greatest Hits
mTYfNBxLlX14TmTK,"['punk', 'rock', 'rockabilly', 'alternative rock', 'hardcore punk', 'cowpunk', 'punk n roll', 'garage punk', 'metal', 'hardcore', 'hard rock', 'post punk']",Social Distortion,Reach for the Sky,"Sex, Love And Rock 'N' Roll"
dfMDtC99XxwJBTho,['pop'],Take That,These Days,III
n01h8e2sZgEOTzkf,"['punk', 'rock', 'pop punk', 'alternative rock', 'grunge', 'jazz', 'hard rock']",Green Day,I Was There,"1,039 / Smoothed out Slappy Hours"
e2KFUcizqLnGSPcy,"['pop', 'rock', 'pop rock', 'folk', 'folk rock']",Nina Nesbitt,Don't Stop,Peroxide (Deluxe)
mc489rIzGdg83sG4,"['indietronica', 'pop', 'new rave', 'indie pop', 'indie rock']",Friendly Fires,Live Those Days Tonight,Pala
bberSed6okySBCjE,"['rock', 'alternative rock', 'indie rock']",The Classic Crime,All The Memories,"What Was Done, Vol. 1: A Decade Revisited"


In [49]:
# word2vec: the query song followed by its 10 nearest neighbours
word2vec_cos = getTopValues(id_song, df_cosineDistance_word2vec).iloc[:11]
# rows 1..10 are the recommendations (row 0 is the query song itself)
word2vec_cos_list = word2vec_cos.iloc[1:11][['artist', 'song']]
word2vec_cos

Unnamed: 0_level_0,genre,artist,song,album_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
007LIJOPQ4Sb98qV,"['post punk', 'new wave', 'dream pop', 'alternative rock', 'madchester', 'rock']",The Chameleons,Nostalgia,What Does Anything Mean? Basically (2009 Remaster)
qSEy5OiW84jPqQb5,"['classic rock', 'rock', 'pop rock', 'pop', 'soft rock', 'hard rock', 'progressive rock', 'new wave', 'easy listening', 'metal', 'alternative rock', 'singer songwriter', 'album rock', 'glam metal']",Foreigner,That Was Yesterday,Agent Provocateur
j2wn3enEucwIMiGR,"['new wave', 'pop', 'synthpop', 'post punk', 'electronica', 'new romantic', 'rock', 'twee pop', 'electro', 'chill out']",Strawberry Switchblade,Since Yesterday,Strawberry Switchblade
TjO2mDe3AnekjwiE,['alternative rock'],Capital Inicial,Melhor do Que Ontem,Viva a Revolução
wdAhzJrYFsHfCyCl,"['electropop', 'pop', 'synthpop']",Doda,Riotka,Riotka
xBI4ltwyJRfkSZ68,"['rock en espanol', 'pop chileno']",La Ley,Día Cero,Invisible
6ZFvx0Vu2O50WgUZ,"['melodic death metal', 'metal', 'death metal', 'swedish metal', 'metalcore', 'gothenburg metal', 'thrash metal', 'black metal', 'rock', 'progressive metal', 'speed metal', 'alternative metal', 'industrial metal', 'symphonic metal', 'power metal', 'melodic metal', 'melodic power metal', 'emo']",In Flames,Tilt,Hot Topic Exclusive EP
fqotTYeTLpnRj3lF,"['pop', 'teen pop', 'rock', 'pop rock', 'disney', 'hip hop', 'grunge', 'alternative rock', 'rap', 'power pop', 'singer songwriter', 'synthpop']",Hilary Duff,So Yesterday,Metamorphosis
DsaREfFnArvBrzMm,"['pop', 'easy listening', 'soft rock', 'rock', 'romantico', 'acoustic pop']",Carpenters,Only Yesterday,Horizon
kgmUex6GMPGuEq2R,"['mpb', 'samba', 'poetry', 'bossa nova']",Chico Buarque,Apesar de você,Chico Buarque: Favourites - 60 years on


Let us get the intersection of the top 10 lists:

In [50]:
# intersect between tf-idf and bert
pd.merge(tfidf_cos_list, bert_cos_list, how ='inner')

Unnamed: 0,artist,song


In [51]:
# intersect between tf-idf and word2vec
pd.merge(tfidf_cos_list, word2vec_cos_list, how ='inner')

Unnamed: 0,artist,song
0,Hilary Duff,So Yesterday
1,Doda,Riotka
2,Foreigner,That Was Yesterday


In [52]:
# intersect between word2vec and bert
pd.merge(word2vec_cos_list, bert_cos_list, how ='inner')

Unnamed: 0,artist,song
0,Capital Inicial,Melhor do Que Ontem


# Save Results

In [53]:
# Restore the pandas display options to their library defaults.
# (Setting max_colwidth to 5 is not the default: it truncates every cell to
# a few characters. reset_option brings back the real default value.)
pd.reset_option("display.max_colwidth")
pd.reset_option("display.max_columns")
# End of program
# Always at the end the program saves the results in to one file, if the file doesn't exist it creates a new one.
saveDataToFile()

# Evaluation of retrieval system

Evaluate your retrieval system using genre as relevance criterion (i.e., a song in the list of retrieved songs (results list) is relevant if any of its genres matches any of the query song's genres). 

In [54]:
def get_genres(field):
    """Parse a genre cell such as ``"['pop', 'rock']"`` into a list of names.

    The cell is the text form of a Python list, so it is parsed as a literal;
    this also handles entries written with double quotes because they contain
    an apostrophe. Text that is not a well-formed list literal falls back to
    extracting everything found between pairs of single quotes.
    """
    try:
        parsed = ast.literal_eval(field)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(genre) for genre in parsed]
    # Not a complete list literal: best-effort quote-delimited extraction
    return re.findall(r"\'(.*?)\'", field)


# Check if any genre of the song one is in the genres of song two, if yes returns True
def isResultRelevant(songOneGenres, songTwoGenres):
    """Return True when the two genre cells share at least one genre."""
    # Each cell is parsed exactly once; the overlap test is a set operation
    return not set(get_genres(songOneGenres)).isdisjoint(get_genres(songTwoGenres))

## Precision
Calculate the fraction of relevant results for each query performed.
Each column in the dataframes created is a performed query.

In [55]:
def precision(dfQueries, topNumber):
    """Mean precision@k over all queries stored in `dfQueries`.

    Each column of `dfQueries` is one query song. For every query the songs
    are ranked with getTopValues, the query song itself is removed, and the
    first `topNumber` results are marked relevant when they share a genre
    with the query song (isResultRelevant). The precision of a query is the
    share of relevant results among those retrieved.

    Returns:
        The mean of the per-query precisions.
    """
    per_query = []
    for query in dfQueries.columns.values:
        # Rank once and reuse it: every call sorts and joins the whole table
        ranking = getTopValues(query, dfQueries)
        querySong = ranking.loc[query]
        top = ranking.drop(axis=0, index=[query]).head(topNumber)
        # Get if each of the results are relevant, if yes is True
        relevant_results = [isResultRelevant(querySong['genre'], genres) for genres in top['genre'].values]
        # No retrieved results means nothing relevant was found (avoids a NaN mean)
        per_query.append(np.mean(relevant_results) if relevant_results else 0.0)

    return np.mean(per_query)

In [56]:
def meanAveragePrecision(dfQueries, topNumber):
    """Mean average precision (MAP@k) over all queries stored in `dfQueries`.

    Each column of `dfQueries` is one query song. For every query the top
    `topNumber` results (query song excluded) are marked relevant when they
    share a genre with the query song. The average precision of a query is
    the sum of precision@i over the ranks i that hold a relevant result,
    divided by the number of relevant results in the list; it is 0 when the
    list has no relevant result.

    Returns:
        The mean of the per-query average precisions.
    """
    AP_ = []
    for query in dfQueries.columns.values: # For each query done
        # Rank once and reuse it: every call sorts and joins the whole table
        ranking = getTopValues(query, dfQueries)
        querySong = ranking.loc[query] # Data of song queried
        top = ranking.drop(axis=0, index=[query]).head(topNumber) # Top n songs values
        # Array containing for each result if it is relevant or not eg. Top5 [True, True, False, True, False]
        relevant_results = [isResultRelevant(querySong['genre'], genres) for genres in top['genre'].values]

        REL = np.sum(relevant_results)
        if REL == 0: # Case when there is no relevant result in the top@K
            AP = 0
        else:
            # Iterate over the results actually retrieved (there may be fewer
            # than topNumber) while keeping a running count of relevant hits.
            hits_so_far = 0
            terms = []
            for rank, is_relevant in enumerate(relevant_results, start=1):
                if is_relevant:
                    hits_so_far += 1
                    terms.append(hits_so_far / rank) # precision at this rank
                else:
                    terms.append(0.0)
            AP = (1/REL) * np.sum(terms)

        AP_.append(AP)

    return np.mean(AP_)

In [57]:
# MAP at both cut-offs for inner product on BERT
p10, p100 = (meanAveragePrecision(df_innerProductDistance_bert, k) for k in (10, 100))
print("MAP@10  :", p10)
print("MAP@100 :", p100)

MAP@10  : 0.6149470899470899
MAP@100 : 0.49664190904709304


In [58]:
# MAP at both cut-offs for the Jaccard formulation on word2vec
p10, p100 = (meanAveragePrecision(df_jaccardDistance_word2vec, k) for k in (10, 100))
print("MAP@10  :", p10)
print("MAP@100 :", p100)

MAP@10  : 0.562347507038242
MAP@100 : 0.5021999875537271


In [59]:
# MAP at both cut-offs for cosine similarity on tf-idf
p10, p100 = (meanAveragePrecision(df_cosineDistance_tfidf, k) for k in (10, 100))
print("MAP@10  :", p10)
print("MAP@100 :", p100)

MAP@10  : 0.6682931783824639
MAP@100 : 0.6048157259403416


## MRR

In [60]:
def meanReciprocalRank(dfQueries, topNumber):
    """Mean reciprocal rank (MRR@k) over all queries stored in `dfQueries`.

    Each column of `dfQueries` is one query song. For every query the top
    `topNumber` results (query song excluded) are marked relevant when they
    share a genre with the query song. The reciprocal rank of a query is
    1 / (1-based rank of the first relevant result), or 0 when no result in
    the list is relevant.

    Returns:
        The mean of the per-query reciprocal ranks.
    """
    RR = []
    for query in dfQueries.columns.values: # For each query done
        # Rank once and reuse it: every call sorts and joins the whole table
        ranking = getTopValues(query, dfQueries)
        querySong = ranking.loc[query] # Data of song queried
        top = ranking.drop(axis=0, index=[query]).head(topNumber) # Top n songs values
        # Array containing for each result if it is relevant or not eg. Top5 [True, True, False, True, False]
        relevant_results = [isResultRelevant(querySong['genre'], genres) for genres in top['genre'].values]

        # 1-based rank of the first relevant result, None when there is none
        first_hit = next((rank for rank, hit in enumerate(relevant_results, start=1) if hit), None)
        if first_hit is None: # Case when there is no relevant result in the top@K
            RR.append(0)
        else:
            RR.append(1/first_hit)

    return np.mean(RR)

In [61]:
# MRR at both cut-offs for inner product on BERT
mrr10, mrr100 = (meanReciprocalRank(df_innerProductDistance_bert, k) for k in (10, 100))
print("MRR@10  :", mrr10)
print("MRR@100 :", mrr100)

MRR@10  : 0.6666666666666666
MRR@100 : 0.6666666666666666


In [62]:
# MRR at both cut-offs for the Jaccard formulation on word2vec
mrr10, mrr100 = (meanReciprocalRank(df_jaccardDistance_word2vec, k) for k in (10, 100))
print("MRR@10  :", mrr10)
print("MRR@100 :", mrr100)

MRR@10  : 0.677536231884058
MRR@100 : 0.6850244270300372


In [63]:
# MRR at both cut-offs for cosine similarity on tf-idf
mrr10, mrr100 = (meanReciprocalRank(df_cosineDistance_tfidf, k) for k in (10, 100))
print("MRR@10  :", mrr10)
print("MRR@100 :", mrr100)

MRR@10  : 0.8061224489795918
MRR@100 : 0.8061224489795918


## nDCG

In [64]:
# Gain for the user is considered with the genre, if the song retrieved contains the genre the gain will be 1, 
# if not 0.

# Also the gain could be considered in descending order from k,..., 0

# For example:
# Given the array of results marked as relevant  [1, 0, 0, 1, 1] for @k = @5
# For the first consideration the user gain will be the same d1(1), d2(0), d3(0), d4(1), d5(1)

# Not implemented
# For the second one the user gain could be d1(5), d2(0), d3(0), d4(4), d5(3),
# reducing in one the relevance everytime a new relevant doc appears

def _binaryDcg(relevance):
    """Discounted cumulative gain of a ranked list of relevance flags.

    The result at rank 1 is not discounted; the result at rank r >= 2 is
    divided by log2(r). An empty list yields 0.0.
    """
    return np.sum([gain / np.log2(rank) if rank > 1 else float(gain)
                   for rank, gain in enumerate(relevance, start=1)])


def ndcgMean(dfQueries, topNumber):
    """Compute the nDCG@topNumber of every query and the mean over all queries.

    Parameters
    ----------
    dfQueries : DataFrame
        One column per query; the column labels are the query song ids that
        are passed to `getTopValues`.
    topNumber : int
        Cut-off k: only the first `topNumber` results (the query itself
        excluded) are evaluated.

    Returns
    -------
    tuple
        `(ndcg, mean)` where `ndcg` is the list of per-query nDCG values in
        column order and `mean` is `np.mean(ndcg)` (nan when `dfQueries` has
        no columns).

    Notes
    -----
    The ideal DCG is built by sorting the relevance flags of the retrieved
    top-k results only, so relevant songs that were not retrieved within the
    top-k do not lower the score.
    """
    ndcg = []
    for query in dfQueries.columns.values:  # For each query done
        # Fetch the result table once per query; it is used both for the
        # queried song's own row and for the list of retrieved songs.
        ranked = getTopValues(query, dfQueries)
        querySong = ranked.loc[query]  # Data of song queried
        top = ranked.drop(axis=0, index=[query]).head(topNumber)  # Top n songs values

        # Relevance flag for each result, e.g. Top5 [True, True, False, True, False]
        relevant_results = [isResultRelevant(querySong['genre'], resultGenres)
                            for resultGenres in top['genre'].values]

        dcg = _binaryDcg(relevant_results)
        # Ideal ordering: all relevant results ranked first.
        idcg = _binaryDcg(sorted(relevant_results, reverse=True))

        if idcg == 0:  # Case when there is no relevant result in the top@K
            ndcg.append(0)
        else:
            ndcg.append(dcg / idcg)

    return (ndcg, np.mean(ndcg))

In [65]:
# Mean nDCG of `df_innerProductDistance_bert` at cut-offs 10 and 100
# (the per-query lists returned first are discarded into `_`).
(_, ndcg10), (_, ndcg100) = (
    ndcgMean(df_innerProductDistance_bert, cutoff) for cutoff in (10, 100)
)
print("NDCG@10  :", ndcg10)
print("NDCG@100 :", ndcg100)

NDCG@10  : 0.783060751802182
NDCG@100 : 0.791442108944539


In [66]:
# Mean nDCG of `df_jaccardDistance_word2vec` at cut-offs 10 and 100
# (the per-query lists returned first are discarded into `_`).
(_, ndcg10), (_, ndcg100) = (
    ndcgMean(df_jaccardDistance_word2vec, cutoff) for cutoff in (10, 100)
)
print("NDCG@10  :", ndcg10)
print("NDCG@100 :", ndcg100)

NDCG@10  : 0.6324209373878787
NDCG@100 : 0.7147653300465455


In [67]:
# Mean nDCG of `df_cosineDistance_tfidf` at cut-offs 10 and 100
# (the per-query lists returned first are discarded into `_`).
(_, ndcg10), (_, ndcg100) = (
    ndcgMean(df_cosineDistance_tfidf, cutoff) for cutoff in (10, 100)
)
print("NDCG@10  :", ndcg10)
print("NDCG@100 :", ndcg100)

NDCG@10  : 0.7855969047342278
NDCG@100 : 0.8305503150056186


## Create a test set, since 76115 songs are too many to compute at once.

In [69]:
# Seed NumPy's global RNG so the sampled test set is reproducible.
# `np.random.seed` must be *called*: assigning to it (`np.random.seed = 101`)
# only replaces the seeding function with an int and seeds nothing. If that
# assignment was already executed in this kernel, restart the kernel first.
np.random.seed(101)
# Draw 100 row positions of `info`. randint samples with replacement, so the
# same position may appear more than once.
indicies_test = np.random.randint(0, len(info), size=100, dtype=int)
indicies_test

array([33242, 58825, 41772, 18438, 68103, 22258, 57629, 11254, 52569,
       65490, 48725, 41582,  2832, 55840, 39186, 69977, 19611, 47349,
       23075, 52546, 44807, 74198,  7097, 49243,  8859, 28290, 34698,
       16083, 31892, 10404, 61823, 56693, 74112, 28578, 36058, 28680,
       35845, 43405, 52903,  7841, 15413,  1601, 26073, 37439,  2840,
       63330,  6346, 40293, 11975, 15719, 74571,  6120, 11201, 53128,
        4793, 67930, 23608, 66746, 47187, 16386, 47070, 14400, 53202,
        1268, 64612, 20173, 68989, 56734, 54160, 44644, 25702,  8935,
       49625, 34132, 12206, 61110,  6639, 40785, 69203, 67148, 36664,
       31263, 58140, 49016, 45172, 13820, 47910, 66012, 65470, 58940,
       61139, 44117, 12688, 44026, 40620, 54072, 52684, 60328, 70572,
       33408])

In [70]:
# Translate the sampled row positions into the index labels of `info`.
song_id_test = info.index[indicies_test].tolist()

### Jaccard formulation with word2vec

In [71]:
# Run the Jaccard formulation on the word2vec features for every test song;
# distanceToSongs receives df_jaccardDistance_word2vec as its target frame.
for song_id in tqdm(song_id_test):
    distanceToSongs(song_id, jaccard_formulation, df_jaccardDistance_word2vec, word2vec)

  return [similarity_function(v, df[index]) for index,song in enumerate(songs)]
  return [similarity_function(v, df[index]) for index,song in enumerate(songs)]
  df[idSong]  = distances
100%|██████████| 100/100 [00:20<00:00,  4.88it/s]


### Jaccard formulation with tfidf

In [72]:
# Run the Jaccard formulation on the tf-idf features for every test song;
# distanceToSongs receives df_jaccardDistance_tfidf as its target frame.
for song_id in tqdm(song_id_test):
    distanceToSongs(song_id, jaccard_formulation, df_jaccardDistance_tfidf, tf_idf)

  df[idSong]  = distances
100%|██████████| 100/100 [01:04<00:00,  1.56it/s]


### Jaccard formulation with bert

In [73]:
# Run the Jaccard formulation on the BERT features for every test song;
# distanceToSongs receives df_jaccardDistance_bert as its target frame.
for song_id in tqdm(song_id_test):
    distanceToSongs(song_id, jaccard_formulation, df_jaccardDistance_bert, bert)

  df[idSong]  = distances
100%|██████████| 100/100 [00:52<00:00,  1.91it/s]


### Cosine Similarity with word2vec

In [74]:
# Run the cosine similarity on the word2vec features for every test song;
# distanceToSongs receives df_cosineDistance_word2vec as its target frame.
for song_id in tqdm(song_id_test):
    distanceToSongs(song_id, cosine_similarity, df_cosineDistance_word2vec, word2vec)

  df[idSong]  = distances
100%|██████████| 100/100 [00:17<00:00,  5.78it/s]


### Cosine Similarity with tfidf

In [75]:
# Run the cosine similarity on the tf-idf features for every test song;
# distanceToSongs receives df_cosineDistance_tfidf as its target frame.
for song_id in tqdm(song_id_test):
    distanceToSongs(song_id, cosine_similarity, df_cosineDistance_tfidf, tf_idf)

  df[idSong]  = distances
100%|██████████| 100/100 [00:54<00:00,  1.84it/s]


### Cosine Similarity with bert

In [76]:
# Run the cosine similarity on the BERT features for every test song;
# distanceToSongs receives df_cosineDistance_bert as its target frame.
for song_id in tqdm(song_id_test):
    distanceToSongs(song_id, cosine_similarity, df_cosineDistance_bert, bert)

  df[idSong]  = distances
100%|██████████| 100/100 [00:43<00:00,  2.31it/s]


In [77]:
# Save data: persist the results computed above by calling saveDataToFile(),
# which is defined elsewhere in this notebook.
# NOTE(review): which frames/files are written is not visible from this cell — confirm against the helper.
saveDataToFile()