In [1]:
import numpy as np
import pandas as pd


In [2]:
df = pd.read_csv('spotify_millsongdata.csv')
df.head(3)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...


In [3]:
df.shape

(57650, 4)

In [4]:
df=df.sample(n=5000).drop('link',axis=1).reset_index(drop=True)

In [5]:
df['song'][0]

'Rose Garden'

In [6]:
df['text'][0]

"I beg you pardon I never promised you a rose garden  \r\nAlong with the sunshine there's gotta be a little rain sometimes  \r\nWhen you take you got to give so live and let live or let go  \r\nI beg you pardon I never promised you a rose garden  \r\n  \r\nI could promise you things like big diamond rings  \r\nBut you don't find roses growin' on stalks of clover so you better think it\r\nover  \r\nWhen it's sweet talking you could make it come true  \r\nI would give you the world right now on a silver platter but what would it\r\nmatter  \r\n  \r\nSo smile for a while and let's be jolly love shouldn't be so melancholy  \r\nCome along and share the good times while we can  \r\nI beg you pardon I never promised you a rose garden  \r\nAlong with the sunshine girl there's gotta be a little rain sometimes  \r\n  \r\nI could sing you a tune and promise you the moon  \r\nBut if that's what it takes to hold you I'd just as soon let you go  \r\nBut there's one thing I want you to know  \r\nYou 

In [7]:
df['text'] = df['text'] .str.lower().replace(r'[^\w\s]','').replace(r'\n' ,'',regex=True)

In [8]:
df['text'][0]

"i beg you pardon i never promised you a rose garden  \ralong with the sunshine there's gotta be a little rain sometimes  \rwhen you take you got to give so live and let live or let go  \ri beg you pardon i never promised you a rose garden  \r  \ri could promise you things like big diamond rings  \rbut you don't find roses growin' on stalks of clover so you better think it\rover  \rwhen it's sweet talking you could make it come true  \ri would give you the world right now on a silver platter but what would it\rmatter  \r  \rso smile for a while and let's be jolly love shouldn't be so melancholy  \rcome along and share the good times while we can  \ri beg you pardon i never promised you a rose garden  \ralong with the sunshine girl there's gotta be a little rain sometimes  \r  \ri could sing you a tune and promise you the moon  \rbut if that's what it takes to hold you i'd just as soon let you go  \rbut there's one thing i want you to know  \ryou better look before you leap still water 

In [9]:
import nltk
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()


def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [ps.stem(w) for w in tokens]
        
    return " ".join(stemming)

In [10]:
tokenization('this is my word loving loved')

'thi is my word love love'

In [11]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [12]:
df['text']

0       i beg you pardon i never promis you a rose gar...
1       i know some peopl say that i 'm the devil in d...
2       roll the dice , roll them twice . my , my can ...
3       is the music of grove skin rock soak in the di...
4       not sure if you know thi but when we first met...
                              ...                        
4995    wound heart i can not save you from yourself t...
4996    the white line of tracer for the facer of the ...
4997    drug stab time well i got work on the ford lin...
4998    everybodi tell me , we love your song your sou...
4999    how do you rate the morn sun after a long and ...
Name: text, Length: 5000, dtype: object

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
tfid = TfidfVectorizer(stop_words='english') 
matrix = tfid.fit_transform(df['text'])

In [23]:
matrix.shape

(5000, 17631)

In [24]:
similarity = cosine_similarity(matrix)

In [25]:
similarity[0]

array([1.        , 0.06714523, 0.        , ..., 0.04513007, 0.05506112,
       0.02712368])

In [26]:
df['song'][0]

'Rose Garden'

In [27]:
df[df['song'] == "Rose Garden"]

Unnamed: 0,artist,song,text
0,Glen Campbell,Rose Garden,i beg you pardon i never promis you a rose gar...


In [28]:
def recommendation(song):
    idx= df[df['song']== song].index[0]
    distances= sorted(list(enumerate(similarity[idx])),reverse=False, key=lambda x:x[1])
    
    songs  = []
    for i in distances[1:21]:
        songs.append(df.iloc[i[0]].song)
    return songs

In [29]:
recommendation("Rose Garden")

['Pangako',
 'Aa Gaye',
 'Once In A Lifetime',
 'Un Amore Per Sempre',
 'Hallelujah',
 'Wandering Shepherd',
 'Sleepwalk',
 'Pick A Bale Of Cotton',
 'Cover It With Gas And Set It On Fire',
 'Dati',
 'Broken Flag',
 "The Devil's Orchard",
 'The Beating Of A High School Spanish Teacher',
 'Sa Ugoy Ng Duyan',
 'Alipin',
 'Jesus Christ',
 'Geneva Farewell',
 'Les Yeux Ouverts',
 'Fiesta De La Noche',
 'Rat Fink']

In [30]:
import pickle
pickle.dump(similarity, open("similarity.pkl", "wb"))
pickle.dump(df, open("df.pkl", "wb"))

