In [2]:
# import the packages needed
# wordnet of NLTK wordnet (Princeton, 2010) -> large lexical database including nouns, verbs, adjectives and adverbs. 
#

from nltk.corpus import wordnet as wn
import numpy as np
from nltk import word_tokenize, pos_tag
import nltk
import pandas as pd
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
#nltk.download('punkt')


In [3]:
# Load in the data from a csv file
df = pd.read_csv('song_artist.csv',header = None) 
df.columns = ['artist', 'song' ]

example of a synset(set of 'cognitive synonyms'): 

In [4]:
wn.synsets('do')

[Synset('bash.n.02'),
 Synset('do.n.02'),
 Synset('doctor_of_osteopathy.n.01'),
 Synset('make.v.01'),
 Synset('perform.v.01'),
 Synset('do.v.03'),
 Synset('do.v.04'),
 Synset('cause.v.01'),
 Synset('practice.v.01'),
 Synset('suffice.v.01'),
 Synset('do.v.08'),
 Synset('act.v.02'),
 Synset('serve.v.09'),
 Synset('do.v.11'),
 Synset('dress.v.16'),
 Synset('do.v.13')]

In [5]:
# The algorithm we use is based on the algorithm proposed by 
# Mihalcea et al. in the paper “Corpus-based and Knowledge-based Measures of Text Semantic Similarity” (https://www.aaai.org/Papers/AAAI/2006/AAAI06-123.pdf)

def penn_to_wn(tag):
    #Convert a Penn Treebank tag to a simplified Wordnet tag
    if tag.startswith('N'):
        return 'n'
 
    if tag.startswith('V'):
        return 'v'
    
    if tag.startswith('R'):
        return 'r'
 
    if tag.startswith('J'):
        return 'a'
 

 
    return None
 
def tagged_to_synset(word, tag):
    wn_tag = penn_to_wn(tag)
    if wn_tag is None:
        return None
 
    try:
        return wn.synsets(word, wn_tag)[0]
    except:
        return None

def sentence_similarity(sentence1, sentence2):
   #compute the sentence similarity using Wordnet


    # Tokenize the word individually and tag them
    sentence1 = pos_tag(word_tokenize(sentence1))
    sentence2 = pos_tag(word_tokenize(sentence2))
 
    # Get the synsets for each tagged word
    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in sentence1]
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in sentence2]
 
    # Filter out the Nones
    synsets1 = [ss for ss in synsets1 if ss]
    synsets2 = [ss for ss in synsets2 if ss]
 
    score, count = 0.0, 0

 
    # For each word in the first sentence
    #for synset in synsets1:
    # Get the similarity value of the most similar word in the other sentence
        
    arr_simi_score = []
    for syn1 in synsets1:
        for syn2 in synsets2:
            simi_score = syn1.wup_similarity(syn2)

 
        # Check that the similarity could have been computed
            if simi_score is not None:
                arr_simi_score.append(simi_score)
                best = max(arr_simi_score)
                score += best
                count += 1
 
    # values average
    if count != 0:
        score /= count


    return score

def symmetric_sentence_similarity(sentence1, sentence2):
    #compute the symmetric sentence similarity using Wordnet
    # this means: first compute the sentence similarity of the query to the songtitle and then the similarity between
    # the songtitle and the query, lastly , dividing the total by 2
    return (sentence_similarity(sentence1, sentence2) + sentence_similarity(sentence2, sentence1)) / 2

#save songs in variable
sentences = []
for x in range(1000):
    sentences = sentences + [df['song'][x]]

In [6]:
def last_search(query):
    focus_sentence = query
    rank = []
    result = ()
    result2 = ()
    index = 0
    floatrank = np.asarray(rank, dtype = np.float32)
    floatrank2 = np.asarray(rank, dtype = np.float32)
    for sentence in sentences:
        similarity = symmetric_sentence_similarity(focus_sentence, sentence)
        #print ("Similarity(\"%s\", \"%s\") = %s" % (focus_sentence, sentence, similarity))
        if similarity > 0.8:
            result = result + (sentence, similarity)
            floatrank = np.asarray(similarity)
            floatrank2 =  np.append(floatrank2, floatrank)
            result2 = result2 + (index,)
        index += 1
    print(result)
    print(result2)
    for x in range(len(result2)):
        print(df['artist'][(result2[x])] + "-" + df['song'][(result2[x])])

example result of the algorithm with query 'light':

giving first the name of the song and the similarity according to the algorithm(on a scale of 0 to 1)

Second, the index of the song/artist in the dataset

Last, The artist and song name

small evaluation by some examples below


In [7]:
last_search('light')

('Green Light', 1.0, 'Moonlight', 0.9565217391304348, 'Moonlight', 0.9565217391304348, 'New Light', 1.0, 'All of the Lights', 1.0, 'Starlight', 0.9565217391304348, 'The Light Is Coming (feat. Nicki Minaj)', 1.0, 'Sunshine of Your Love', 0.9565217391304348, 'Light On', 1.0)
(114, 193, 220, 378, 390, 595, 717, 737, 955)
Lorde-Green Light
Foals-Moonlight
xxxtentacion-Moonlight
John Mayer-New Light
Kanye West-All of the Lights
Muse-Starlight
Ariana Grande-The Light Is Coming (feat. Nicki Minaj)
Cream-Sunshine of Your Love
Maggie Rogers-Light On


In [11]:
last_search('blue')

('Pink + White', 0.875, 'Big Blue', 1.0, 'Bodak Yellow', 0.875, 'Blue Jeans', 1.0, 'Brown Eyed Girl', 0.875, 'Blue Monday - 2016 Remastered Version', 1.0)
(86, 379, 399, 450, 697, 733)
Frank Ocean-Pink + White
Vampire Weekend-Big Blue
Cardi B-Bodak Yellow
Lana Del Rey-Blue Jeans
Van Morrison-Brown Eyed Girl
New Order-Blue Monday - 2016 Remastered Version


In [12]:
last_search('happy')

('Happier', 1.0, 'You Seemed so Happy', 0.85, 'Happier', 1.0)
(45, 958, 998)
Marshmello-Happier
The Japanese House-You Seemed so Happy
Ed Sheeran-Happier


In [13]:
last_search('dance')

('Dance to This (feat. Ariana Grande)', 1.0, 'I Wanna Dance with Somebody (Who Loves Me)', 0.9133333333333333, 'Just Dance', 1.0, 'One Dance', 1.0, 'Shut Up and Dance', 1.0, 'Dance, Dance', 1.0)
(268, 396, 468, 471, 604, 609)
Troye Sivan-Dance to This (feat. Ariana Grande)
Whitney Houston-I Wanna Dance with Somebody (Who Loves Me)
Lady Gaga-Just Dance
Drake-One Dance
Walk the Moon-Shut Up and Dance
Fall Out Boy-Dance, Dance


In [14]:
# shows no results , nothing is similar enough to sky in our database
last_search('sky')

()
()


In [19]:
#as '0' is not an actual word it won't return anyting, however, 'zero' does.
last_search('0')

()
()


In [18]:
last_search('zero')

('Nothing Else Matters', 1.0, "There's Nothing Holdin' Me Back", 1.0)
(249, 663)
Metallica-Nothing Else Matters
Shawn Mendes-There's Nothing Holdin' Me Back


In [20]:
last_search('what about an actual sentence?')

()
()


In [22]:
last_search('does not always perform well')

('Do I Wanna Know?', 1.0, "Don't Stop Me Now - Remastered", 1.0, 'Make Up', 1.0, "Don't Call Me Up", 1.0, 'Who Do You Love (with 5 Seconds of Summer)', 1.0, "Don't Stop Believin'", 1.0, 'Look What You Made Me Do', 0.84375, "It's Not Living (If It's Not with You)", 0.90625, 'Make Me Feel', 1.0, 'Love It If We Made It', 0.8125, "(Don't Fear) The Reaper", 1.0, "Boys Don't Cry", 0.9333333333333333, "Don't Speak", 1.0, 'Oops!...I Did It Again', 1.0, 'You Make My Dreams', 1.0, 'Sorry Not Sorry', 0.9166666666666667, 'I Did Something Bad', 1.0, "Hips Don't Lie", 0.9351851851851851, "Don't You (Forget About Me)", 1.0, "I'm Not Okay (I Promise)", 0.8928571428571428, "What I've Done", 1.0, "Don't You Want Me", 1.0, "I'm Not the Only One", 0.8928571428571428, "Don't Stop the Music", 1.0)
(23, 31, 47, 71, 175, 185, 221, 294, 331, 339, 354, 392, 438, 475, 488, 582, 588, 608, 615, 646, 730, 735, 836, 976)
Arctic Monkeys-Do I Wanna Know?
Queen-Don't Stop Me Now - Remastered
Ariana Grande-Make Up
Mabel