In [1]:
import numpy as np
import pandas as pd

In [2]:
'''
Loading the data and explicitly assigning index value (this step is not required I did this because when using df.sample()
method, there was index problem)
'''
df = pd.DataFrame({"index" : list(range(0,57650))})
df = pd.concat([df, pd.read_csv("Spotify_dataset.csv")], axis = "columns")
df.head()
df = df[df["index"] < 10000]
df.head()

Unnamed: 0,index,artist,song,link,text
0,0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [8]:
# Removing the link column as it won't be of any use to us
df.drop(columns = ["link"], inplace = True)

In [10]:
df.head()

Unnamed: 0,index,artist,song,text
0,0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \nAnd..."
1,1,ABBA,"Andante, Andante","Take it easy with me, please \nTouch me gentl..."
2,2,ABBA,As Good As New,I'll never know why I had to go \nWhy I had t...
3,3,ABBA,Bang,Making somebody happy is a question of give an...
4,4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [11]:
df.shape

(10000, 4)

In [12]:
# Checking for null values
df.isnull().sum()

index     0
artist    0
song      0
text      0
dtype: int64

In [13]:
'''
ntlk is a very famous natural language processing library, we need to remove unnecesarry words from the lyrics that won't add
meaning to the lyric or are not keywords
'''
import nltk
from nltk.corpus import stopwords

In [14]:
def remove_stopwords(text):
    stop = stopwords.words("english")
    res = ""
    for i in text.split():
        i = i.lower()
        if i not in stop:
            res += i + " "
    res.rstrip()
    return res

In [15]:
df["text"] = df["text"].apply(remove_stopwords)

In [17]:
'''
After analyzing some lyrics I found out some more words getting repeated in the lyrics that won't be contributing any meaning.
Hence I compiled some that I could find like [verse] or lyric x 3
'''
def remove_chorus(text):
    res = ""
    elements = ["chorus", "[chorus]", "verse", "[verse]", "x", "2", "3"]
    for i in text.split():
        if i not in elements:
            res += i + " "
    res.rstrip()
    return res

In [18]:
df["text"] = df["text"].apply(remove_chorus)

In [16]:
# Joining the name of artist to ensure that artists with the same first names are not grouped together
df["artist"] = df["artist"].apply(lambda x: "".join(x.split()))

In [19]:
# Creating a column of tags using the combined artist names and adjusted lyrics which will be used to group songs together
df["tags"] = df["artist"] + df["text"]

In [20]:
df.drop(columns = ['text'], inplace = True)

In [21]:
df.head()

Unnamed: 0,index,artist,song,tags
0,0,ABBA,Ahe's My Kind Of Girl,"ABBAlook face, wonderful face means something ..."
1,1,ABBA,"Andante, Andante","ABBAtake easy me, please touch gently like sum..."
2,2,ABBA,As Good As New,ABBAi'll never know go put lousy rotten show b...
3,3,ABBA,Bang,ABBAmaking somebody happy question give take l...
4,4,ABBA,Bang-A-Boomerang,ABBAmaking somebody happy question give take l...


In [22]:
# Max features parameters takes in the number of which the top (parameter) words will be included in making the vector
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=500,stop_words='english')

In [23]:
vector = cv.fit_transform(df['tags']).toarray()

In [24]:
# Will almost always return a sparse matrix represting the frequency in the song for the top frequency words
vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 3, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 2]], dtype=int64)

In [25]:
vector.shape

(10000, 500)

In [26]:
# In order to find out the similarity between 2 vectors, cosine similarity has been used here
from sklearn.metrics.pairwise import cosine_similarity

In [27]:
similarity = cosine_similarity(vector)

In [28]:
similarity[0]

array([1.        , 0.        , 0.03947368, ..., 0.10650039, 0.        ,
       0.1440976 ])

In [30]:
def recommend(song):
    index = list(df[df["song"] == song]["index"])[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    print("You may also like :- ")
    print()
    for i in distances[1:6]:
        obj = df.iloc[i[0]]
        print(f"{obj.song} by {obj.artist}")

In [31]:
recommend("Mexicali Rose")

You may also like :- 

Never Make You Cry by EricClapton
Tears For You by Judds
Leaving On A Jet Plane by JohnDenver
Big Ball's In Cowtown by GeorgeStrait
My Melancholy Baby by BingCrosby
