In [1]:
import os
import sys
sys.path.append('..')

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

import nltk
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity
from gensim.utils import simple_preprocess

from sentence_transformers import SentenceTransformer, util

from preprocessing import get_dataframe, set_seed

# Fetch every NLTK resource the notebook relies on so that it survives
# "Restart Kernel -> Run All" on a fresh machine:
#   * 'stopwords' - stopwords.words('english')
#   * 'punkt'     - sentence tokenizer models used internally by word_tokenize
#   * 'wordnet'   - WordNetLemmatizer
# NOTE(review): recent NLTK releases look the tokenizer data up as 'punkt_tab'
# instead of 'punkt' - confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nazariinyzhnyk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nazariinyzhnyk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
set_seed(42)  # project helper imported from `preprocessing`; presumably seeds the RNGs for reproducibility - TODO confirm

In [4]:
# The lyrics are stored in ../data/lyrics (relative to this notebook).
# get_dataframe is a project helper; the resulting frame has the columns
# album / song / text shown below.
lyrics_dir = os.path.join('..', 'data', 'lyrics')
df = get_dataframe(lyrics_dir)
df

Unnamed: 0,album,song,text
0,AHardDaysNight,Ill_Be_Back.txt,"You know, if you break my heart I'll go But I'..."
1,AHardDaysNight,Cant_Buy_Me_Love.txt,"Can't buy me love, oh Love, oh Can't buy me lo..."
2,AHardDaysNight,Any_Time_At_All.txt,Any time at all Any time at all Any time at al...
3,AHardDaysNight,A_Hard_Days_Night.txt,It's been a hard day's night And I've been wor...
4,AHardDaysNight,Ill_Cry_Instead.txt,I've got every reason on earth to be mad 'Caus...
...,...,...,...
177,Help,Another_Girl.txt,For I have got another girl Another girl You'...
178,Help,Help.txt,Help! I need somebody Help! Not just anybody H...
179,Help,Ive_Just_Seen_A_Face.txt,I've just seen a face I can't forget the time ...
180,Help,Tell_Me_What_You_See.txt,If you let me take your heart I will prove to ...


In [5]:
# Remove duplicated songs from df - see basic_EDA.ipynb for a detailed description.
# Both duplicates are the copies filed under the YellowSubmarine album, so one
# combined mask selects them and a single drop removes them.
duplicated_songs = ["All_You_Need_Is_Love.txt", "Yellow_Submarine.txt"]
is_duplicate = df.song.isin(duplicated_songs) & (df.album == "YellowSubmarine")
df.drop(index=df.index[is_duplicate], inplace=True)

In [6]:
df  # show the frame again after the duplicated songs were dropped

Unnamed: 0,album,song,text
0,AHardDaysNight,Ill_Be_Back.txt,"You know, if you break my heart I'll go But I'..."
1,AHardDaysNight,Cant_Buy_Me_Love.txt,"Can't buy me love, oh Love, oh Can't buy me lo..."
2,AHardDaysNight,Any_Time_At_All.txt,Any time at all Any time at all Any time at al...
3,AHardDaysNight,A_Hard_Days_Night.txt,It's been a hard day's night And I've been wor...
4,AHardDaysNight,Ill_Cry_Instead.txt,I've got every reason on earth to be mad 'Caus...
...,...,...,...
177,Help,Another_Girl.txt,For I have got another girl Another girl You'...
178,Help,Help.txt,Help! I need somebody Help! Not just anybody H...
179,Help,Ive_Just_Seen_A_Face.txt,I've just seen a face I can't forget the time ...
180,Help,Tell_Me_What_You_See.txt,If you let me take your heart I will prove to ...


### Solution 0 (dummy): tf-idf on unprocessed text

In [7]:
search_terms = 'all is love'
documents = df.text.tolist()

# Fit a single tf-idf vocabulary over the query and all lyrics;
# row 0 of the resulting matrix is the query, rows 1.. are the songs.
doc_vectors = TfidfVectorizer().fit_transform([search_terms, *documents])

# Compare the query row with every row (itself included, at position 0).
query_vector = doc_vectors[:1]
cosine_similarities = cosine_similarity(query_vector, doc_vectors).ravel()

# Skip position 0 (query vs. itself) and keep plain Python floats per song.
document_scores = cosine_similarities[1:].tolist()

In [8]:
# Row 0 is the vectorized query: it stores only 3 non-zero weights ('all', 'is', 'love')
# out of the 2407 terms of the fitted vocabulary (see the repr below).
# This single vector is compared with every other row using cosine similarity, which
# measures the angle between two vectors. Because the documents were vectorized with the
# same tf-idf transform, the documents pointing in a direction closer to the query get a
# higher score (cos(0) = 1 -> same direction of vectors).
doc_vectors[:1]

<1x2407 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

![cosine sim](https://cs.carleton.edu/cs_comps/0910/netflixprize/final_results/knn/img/knn/cos.png)

In [9]:
cosine_similarities[0]  # the query compared with itself: identical vectors give a cosine similarity of 1

1.0

In [10]:
# Attach the tf-idf scores to a copy of the frame and list the five best matches.
top_df = df.assign(similarity_score=document_scores)
top_df.sort_values(by='similarity_score', ascending=False).head(5)

Unnamed: 0,album,song,text,similarity_score
73,MagicalMysteryTour,All_You_Need_Is_Love.txt,"Love, love, love Love, love, love Love, love, ...",0.69459
18,AbbeyRoad,The_End.txt,Oh yeah All right Are you gonna be in my dream...,0.516073
15,AbbeyRoad,Because.txt,Ah Because the world is round It turns me on B...,0.407631
42,PleasePleaseMe,Love_Me_Do.txt,"Love, love me do You know I love you I'll alwa...",0.392194
59,Revolver,Tomorrow_Never_Knows.txt,Turn off your mind Relax and float down stream...,0.362012


### Solution 1: tf-idf on preprocessed text

#### If we apply the same algorithm to preprocessed text, the results should be better: removing stop words and punctuation and lemmatizing the documents shrinks the vocabulary, so the vectors contain fewer elements and the matrix becomes less sparse.

In [11]:
word_tokenize('Love, love, mind) Relax - and (float down stream...')  # word_tokenize keeps punctuation marks as separate tokens

['Love',
 ',',
 'love',
 ',',
 'mind',
 ')',
 'Relax',
 '-',
 'and',
 '(',
 'float',
 'down',
 'stream',
 '...']

In [12]:
# Keep the query words searchable: start from NLTK's English stop-word list
# and take every word of the query out of it before turning it into a set.
stop_words = stopwords.words('english')
for word in 'all is love'.split():
    try:
        stop_words.remove(word)
    except ValueError:
        pass  # this query word is not in the stop-word list - nothing to remove
stop_words = set(stop_words)

In [13]:
class LemmaTokenizer:
    """Callable tokenizer: NLTK word tokenization followed by WordNet lemmatization.

    An instance is passed to ``TfidfVectorizer`` as its ``tokenizer``.
    """

    # Punctuation tokens produced by word_tokenize that are left out of the result.
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`', '(', ')']

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemma of every token of ``doc`` that is not in ``ignore_tokens``."""
        lemmas = []
        for token in word_tokenize(doc):
            if token in self.ignore_tokens:
                continue
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

# Run the stop words through the same tokenizer/lemmatizer as the documents,
# so that the stop-word list matches the tokens the vectorizer will see.
tokenizer = LemmaTokenizer()
token_stop = tokenizer(' '.join(stop_words))

search_terms = 'all is love'
documents = df.text.tolist()

# Build the tf-idf model; row 0 of the matrix is the query, rows 1.. are the songs.
vectorizer = TfidfVectorizer(tokenizer=tokenizer, stop_words=token_stop)
doc_vectors = vectorizer.fit_transform([search_terms, *documents])

# Score the query against every row. TfidfVectorizer L2-normalises its rows by
# default, so the plain dot product (linear kernel) equals the cosine similarity.
cosine_similarities = linear_kernel(doc_vectors[:1], doc_vectors).ravel()

# Position 0 is the query vs. itself; keep the per-song scores as Python floats.
document_scores = cosine_similarities[1:].tolist()

In [14]:
lm = LemmaTokenizer()  # Example of how the lemmatizer modifies words
lm('cats loves')  # every token is replaced by its lemma (see the output below)

['cat', 'love']

In [15]:
# Five best matches according to tf-idf on the preprocessed (lemmatized) text.
top_df = df.assign(similarity_score=document_scores)
ranked = top_df.sort_values('similarity_score', ascending=False)
ranked.head(5)

Unnamed: 0,album,song,text,similarity_score
73,MagicalMysteryTour,All_You_Need_Is_Love.txt,"Love, love, love Love, love, love Love, love, ...",0.923018
18,AbbeyRoad,The_End.txt,Oh yeah All right Are you gonna be in my dream...,0.734843
42,PleasePleaseMe,Love_Me_Do.txt,"Love, love me do You know I love you I'll alwa...",0.595745
51,PleasePleaseMe,PS_I_Love_You.txt,As I write this letter Send my love to you Rem...,0.451099
12,AHardDaysNight,And_I_Love_Her.txt,"I give her all my love, that's all I do And if...",0.438177


#### As one can see, this approach gives higher similarity scores. "Love Me Do" was in 4th place in the previous ranking and is now in 3rd place, while "Because" dropped out of the top-5.

#### Solution 3: Semantic Similarity with SoftCosineSimilarity metric

##### Vectorize texts with respect to semantic similarity: "cat" is more similar to "dog" than to "truck".

In [16]:
# Rebuild the stop-word set for this solution: NLTK's English list minus the
# words of the query, so that 'all' / 'is' / 'love' are not filtered out.
query_words = 'all is love'.split()
english_stops = stopwords.words('english')
for word in query_words:
    if english_stops.count(word):
        del english_stops[english_stops.index(word)]
stop_words = set(english_stops)

def preprocess(doc):
    """Tokenize ``doc`` with gensim's ``simple_preprocess`` and drop stop words.

    The token-length limits of ``simple_preprocess`` are disabled
    (``min_len=0``, ``max_len=inf``); tokens found in the module-level
    ``stop_words`` collection are removed from the returned list.
    """
    tokens = simple_preprocess(doc, min_len=0, max_len=float("inf"))
    return [tok for tok in tokens if tok not in stop_words]


# Pre-trained word vectors ("glove-wiki-gigaword-50") fetched through gensim's
# downloader; they provide the word-to-word similarities.
glove = api.load("glove-wiki-gigaword-50")
similarity_index = WordEmbeddingSimilarityIndex(glove)

search_terms = 'all is love'
documents = df.text.tolist()

# Tokenize the lyrics and the query in exactly the same way.
corpus = list(map(preprocess, documents))
query = preprocess(search_terms)

# The dictionary (and the tf-idf statistics derived from it) covers the songs and the query.
dictionary = Dictionary([*corpus, query])
tfidf = TfidfModel(dictionary=dictionary)

# Term-to-term similarity matrix built from the embedding index, the dictionary and the tf-idf model.
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)

In [17]:
# tf-idf weighted bag-of-words for the query ...
query_bow = dictionary.doc2bow(query)
query_tf = tfidf[query_bow]

# ... and for every song, indexed for soft-cosine lookups.
corpus_bow = [dictionary.doc2bow(document) for document in corpus]
index = SoftCosineSimilarity(tfidf[corpus_bow], similarity_matrix)

# One soft-cosine score per song, in corpus order.
doc_similarity_scores = index[query_tf]

  Y = np.multiply(Y, 1 / np.sqrt(Y_norm))
  Y = np.multiply(Y, 1 / np.sqrt(Y_norm))


In [18]:
# Five best matches according to the soft-cosine (GloVe based) scores.
top_df = df.assign(similarity_score=doc_similarity_scores)
top_df.sort_values(by=['similarity_score'], ascending=False).head(5)

Unnamed: 0,album,song,text,similarity_score
73,MagicalMysteryTour,All_You_Need_Is_Love.txt,"Love, love, love Love, love, love Love, love, ...",0.987623
69,MagicalMysteryTour,Penny_Lane.txt,In Penny Lane there is a barber showing photog...,0.981254
131,SgtPeppers,Within_You_WIthout_You.txt,We were talking about the space between us all...,0.924626
78,BeatlesForSale,Im_A_Loser.txt,I'm a loser I'm a loser And I'm not what I app...,0.916645
61,Revolver,Love_You_To.txt,"Each day just goes so fast I turn around, it's...",0.897382


##### "All_You_Need_Is_Love" is still first, but places 2,3 now belong to Penny_Lane.txt, Within_You_WIthout_You.txt

#### Solution 4: Semantic Similarity with distilbert and CosineSimilarity

In [19]:
embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# Raw lyrics and raw query go straight into the encoder - no tokenization,
# stop-word removal or lemmatization is applied in this solution.
# NOTE(review): `corpus` and `query` reuse the names of the token lists built for
# the soft-cosine solution; here they hold plain strings.
corpus = df.text.tolist()
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

query = 'all is love'
query_embedding = embedder.encode(query, convert_to_tensor=True)

# Row 0 of the similarity matrix holds the query-vs-song scores; move it to the
# CPU so that it can be converted to NumPy later.
similarity_matrix_qc = util.pytorch_cos_sim(query_embedding, corpus_embeddings)
cos_scores = similarity_matrix_qc[0].cpu()

In [20]:
# Five best matches according to the sentence-embedding cosine scores.
top_df = df.assign(similarity_score=cos_scores.numpy())
top_df.sort_values('similarity_score', ascending=False).iloc[:5]

Unnamed: 0,album,song,text,similarity_score
42,PleasePleaseMe,Love_Me_Do.txt,"Love, love me do You know I love you I'll alwa...",0.722462
51,PleasePleaseMe,PS_I_Love_You.txt,As I write this letter Send my love to you Rem...,0.67526
12,AHardDaysNight,And_I_Love_Her.txt,"I give her all my love, that's all I do And if...",0.671278
18,AbbeyRoad,The_End.txt,Oh yeah All right Are you gonna be in my dream...,0.635978
85,BeatlesForSale,Words_Of_Love.txt,Hold me close And tell me how you feel Tell me...,0.597593


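##### "All_You_Need_Is_Love" is not even in the top-5! The first guess was that "all" and "is" were removed from the final corpus, but this solution passes the raw lyrics and the raw query to the encoder without any stop-word filtering, so that is unlikely to be the cause. A more plausible explanation (to be verified) is that the sentence encoder condenses each whole song into a single embedding, possibly truncating long lyrics to the model's maximum sequence length, so exact word overlap with the query matters much less than in the tf-idf approaches.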

### Let's analyse top-3 songs, selected by algorithms

#### tf-idf selections

In [21]:
# Full lyrics of the top tf-idf match.
df.loc[df.song == "All_You_Need_Is_Love.txt", "text"].tolist()

["Love, love, love Love, love, love Love, love, love  (Lo-o-ove) There's nothing you can do that can't be done (Lo-o-ove) Nothing you can sing that can't be sung (Lo-o-ove) Nothing you can say, but you can learn how to play the game It's easy  (Lo-o-ove) Nothing you can make that can't be made (Lo-o-ove) No-one you can save that can't be saved (Lo-o-ove) Nothing you can do, but you can learn how to be you in time It's easy  All you need is love All you need is love All you need is love, love Love is all you need  Lo-ove, love Love, love, love Love, love, love  All you need is love (Whoo) All you need is love (Hey) All you need is love, love Love is all you need  (Lo-o-ove) Nothing you can know that isn't known (Lo-o-ove) Nothing you can see that isn't shown (Lo-o-ove) There's nowhere you can be that isn't where you're meant to be It's easy  All you need is love All you need is love All you need is love, love Love is all you need  All you need is love (All together now!) All you need is

In [22]:
# Full lyrics of the second-ranked tf-idf match.
df.loc[df.song == "The_End.txt", "text"].tolist()

['Oh yeah All right Are you gonna be in my dreams Tonight?  Love you, love you, love you, love you Love you, love you, love you, love you Love you, love you, love you, love you Love you, love you, love you, love you Love you, love you, love you, love you Love you, love you, love you, love you  And in the end The love you take Is equal to the love you make']

In [23]:
# Third place for tf-idf on the unprocessed text.
df.loc[df.song == "Because.txt", "text"].tolist()

['Ah Because the world is round It turns me on Because the world is round  Ah Because the wind is high It blows my mind Because the wind is high  Ah Love is old, love is new Love is all, love is you  Because the sky is blue It makes me cry Because the sky is blue']

In [24]:
# Third place for tf-idf on the preprocessed text.
df.loc[df.song == "Love_Me_Do.txt", "text"].tolist()

["Love, love me do You know I love you I'll always be true So please, love me do Whoa, love me do  Love, love me do You know I love you I'll always be true So please, love me do Whoa, love me do  Someone to love Somebody new Someone to love Someone like you  Love, love me do You know I love you I'll always be true So please, love me do Whoa, love me do  Love, love me do You know I love you I'll always be true So please, love me do Whoa, love me do Yeah, love me do Whoa, love me do Yeah, love me do "]

#### GLOVE selections

In [25]:
# Second place in the soft-cosine (GloVe) ranking.
df.loc[df.song == "Penny_Lane.txt", "text"].tolist()

["In Penny Lane there is a barber showing photographs Of every head he's had the pleasure to know And all the people that come and go Stop and say hello  On the corner is a banker with a motorcar The little children laugh at him behind his back And the banker never wears a mac In the pouring rain Very strange  Penny Lane is in my ears and in my eyes There beneath the blue suburban skies I sit and meanwhile back  In Penny Lane, there is a fireman with an hour glass And in his pocket is a portrait of the Queen He likes to keep his fire engine clean It's a clean machine  Penny Lane is in my ears and in my eyes Four of fish and finger pies in summer Meanwhile back  Behind the shelter in the middle of the roundabout A pretty nurse is selling poppies from a tray And though she feels as if she's in a play She is anyway  In Penny Lane the barber shaves another customer We see the banker sitting waiting for a trim And then the fireman rushes in From the pouring rain Very strange  Penny Lane is 

In [26]:
# Third place in the soft-cosine (GloVe) ranking.
df.loc[df.song == "Within_You_WIthout_You.txt", "text"].tolist()

["We were talking about the space between us all And the people who hide themselves behind a wall of illusion Never glimpse the truth Then it's far too late When they pass away  We were talking about the love we all could share when we find it To try our best to hold it there With our love With our love we could save the world If they only knew  Try to realize it's all within yourself No one else can make you change And to see you're really only very small And life flows on within you and without you  We were talking about the love that's gone so cold And the people who gain the world and lose their soul They don't know They can't see Are you one of them?  When you've seen beyond yourself Then you may find peace of mind is waiting there And the time will come when you see We're all one and life flows on within you and without you"]

#### distilBERT selections 

In [27]:
# Third place in the distilBERT ranking.
df.loc[df.song == "And_I_Love_Her.txt", "text"].tolist()

["I give her all my love, that's all I do And if you saw my love, you'd love her too I love her  She gives me everything and tenderly The kiss my lover brings, she brings to me And I love her  A love like ours could never die As long as I have you near me  Bright are the stars that shine, dark is the sky I know this love of mine will never die And I love her  Bright are the stars that shine, dark is the sky I know this love of mine will never die And I love her "]

In [28]:
# Second place in the distilBERT ranking.
df.loc[df.song == "PS_I_Love_You.txt", "text"].tolist()

["As I write this letter Send my love to you Remember that I'll always Be in love with you  Treasure these few words 'Til we're together Keep all my love forever P.S. I love you You, you, you  I'll be coming home again to you, love And 'til the day I do, love P.S. I love you You, you, you  As I write this letter Send my love to you Remember that I'll always Be in love with you  Treasure these few words 'Til we're together Keep all my love forever P.S. I love you You, you, you  As I write this letter Send my love to you (you know I want you to) Remember that I'll always Be in love with you  I'll be coming home again to you, love And 'til the day I do, love P.S. I love you You, you, you You, you, you I love you "]

In [29]:
# NOTE(review): this repeats the And_I_Love_Her.txt lookup made two cells earlier.
# The remaining distilBERT top-3 song, Love_Me_Do.txt, is already printed in the
# tf-idf section, so this cell was probably meant to show that song or be removed.
df.loc[df.song == "And_I_Love_Her.txt", "text"].tolist()

["I give her all my love, that's all I do And if you saw my love, you'd love her too I love her  She gives me everything and tenderly The kiss my lover brings, she brings to me And I love her  A love like ours could never die As long as I have you near me  Bright are the stars that shine, dark is the sky I know this love of mine will never die And I love her  Bright are the stars that shine, dark is the sky I know this love of mine will never die And I love her "]

### What could be done next?


#### try other encoders, lemmatisation/stemming techniques, play with stopwords
#### try other language models in sentence-transformers package