Importing NLTK

In [114]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/neelaychakravarthy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/neelaychakravarthy/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/neelaychakravarthy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/neelaychakravarthy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [115]:
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

Functions to find synonyms and remove stop words

In [116]:
def find_synonyms(words):
    synonyms = set()
    
    for word in words:
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.add(lemma.name())  # Add the synonyms to the set
    
    return list(synonyms)


In [117]:
def remove_stop_words(query):
    indiv_tokens = word_tokenize(query)
    stop = set(stopwords.words('english'))
    processed_tokens = []

    for word in indiv_tokens:
            if word.lower() not in stop:
                processed_tokens.append(word)
    return processed_tokens


Creating document set

In [118]:
import pandas as pd
import os

df = pd.read_csv('Lyrics_complete.csv', encoding='latin1')
print(df.columns)

songs = 'song_lyrics'

documents = []


if not os.path.exists(songs):
    os.makedirs(songs)

for index, row in df.iterrows():
    song_name = row['Song Name']  
    lyrics = row['Lyrics']  

    documents.append(remove_stop_words(lyrics))
   
    filename = "".join([c for c in song_name if c.isalpha() or c.isdigit() or c==' ']).rstrip()
    

    filepath = os.path.join(songs, f'{filename}.txt')
    
  
    with open(filepath, 'w', encoding='utf-8') as file:
        file.write(lyrics)

Index(['Song Name', 'URL', 'Artist', 'Lyrics'], dtype='object')


In [119]:
songs = []
for doc in documents:
    c = ""
    for word in doc:
        c += " " + word
    songs.append(c)
# print(songs)

In [120]:
query = "i want romance and petals and sex "
q = remove_stop_words(query)
l = find_synonyms(q)

In [121]:
print(q)
print(l)

['want', 'romance', 'petals', 'sex']
['court', 'dally', 'Latin', 'wish', 'deficiency', 'arouse', 'gender', 'romance', 'romanticism', 'philander', 'excite', 'petal', 'privation', 'butterfly', 'sex', 'mash', 'woo', 'sexuality', 'deprivation', 'neediness', 'love_affair', 'sexual_urge', 'solicit', 'flower_petal', 'turn_on', 'coquet', 'flirt', 'Latinian_language', 'need', 'Romance', 'require', 'wind_up', 'sexual_practice', 'sexual_activity', 'want', 'sex_activity', 'wishing', 'coquette', 'desire', 'Romance_language', 'chat_up', 'lack', 'love_story']


In [125]:
c = ""
for word in l:
    c += " " + word
songs.append(c)
    
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
sample = ["sample sample sample sample", "hello it's me"]
tfidf_matrix = tfidf_vectorizer.fit_transform(songs)
print(tfidf_matrix)

  (0, 1243)	0.030563651433216946
  (0, 994)	0.028055852369096322
  (0, 3072)	0.013814378501687357
  (0, 1319)	0.028055852369096322
  (0, 4085)	0.02627654061666212
  (0, 1274)	0.028055852369096322
  (0, 4135)	0.0219894298001073
  (0, 1548)	0.017702318983552477
  (0, 716)	0.030563651433216946
  (0, 4554)	0.030563651433216946
  (0, 2089)	0.0219894298001073
  (0, 1705)	0.01697383167186605
  (0, 375)	0.030563651433216946
  (0, 1744)	0.023768741552541498
  (0, 5032)	0.01697383167186605
  (0, 341)	0.022815320597510194
  (0, 4371)	0.030563651433216946
  (0, 2851)	0.030563651433216946
  (0, 1567)	0.028055852369096322
  (0, 2375)	0.030563651433216946
  (0, 2545)	0.01852820978095537
  (0, 3149)	0.030563651433216946
  (0, 2537)	0.028055852369096322
  (0, 515)	0.061827865147088414
  (0, 2331)	0.020609288382362805
  :	:
  (101, 2657)	0.1545061375283335
  (101, 3052)	0.1545061375283335
  (101, 1180)	0.1545061375283335
  (101, 4005)	0.1545061375283335
  (101, 2761)	0.1545061375283335
  (101, 650)	0.15

In [123]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarities = cosine_similarity(tfidf_matrix)

print('Cosine similarity between documents is: \n', cosine_similarities)
print('\n')

num_rows, num_columns = cosine_similarities.shape
ranked_order = np.argsort(-cosine_similarities[:num_rows-1, num_columns-1], axis = 0)

print('Ranked order of documents:', ranked_order)
ranked_songs = []
artists = []
for item in ranked_order:
    ranked_songs.append(df.loc[item]['Song Name'])
    artists.append(df.loc[item]['Artist'])
# print(ranked_songs)

Cosine similarity between documents is: 
 [[1.         0.04906253 0.04175473 ... 0.09291161 0.06177973 0.00334949]
 [0.04906253 1.         0.05154628 ... 0.03256964 0.03240265 0.0017647 ]
 [0.04175473 0.05154628 1.         ... 0.01681601 0.02000859 0.        ]
 ...
 [0.09291161 0.03256964 0.01681601 ... 1.         0.0274431  0.        ]
 [0.06177973 0.03240265 0.02000859 ... 0.0274431  1.         0.        ]
 [0.00334949 0.0017647  0.         ... 0.         0.         1.        ]]


Ranked order of documents: [ 8 26 18  9 83 46 47 89 31 73 11 69 42 13 16 77 55 37 84 49 86 71 81 30
 23 66 34 29 60 90 43 17 51 85 36 39 25 67 19  6 93 45 57  0 21 22 96 59
 78  1 38 35  5 12 92 80 41 62 91 10 82 14 79 44 76  4 87 88  3  2 94 95
 97  7 75 20 15 40 48 98 50 33 52 53 54 32 56 28 58 27 61 63 64 65 24 68
 70 72 74 99]


In [124]:
for i in range(5):
    print(f"{ranked_songs[i]} by {artists[i]}")

i wish i hated you by Ariana Grande
I Want It That Way by Backstreet Boys
TEXAS HOLD 'EM by Beyonce
supernatural by Ariana Grande
Tejano Blue by Cigarettes After Sex
