In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
data=pd.read_csv('spotify_millsongdata.csv')

In [3]:
data.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
data['artist'].value_counts()

artist
Donna Summer        191
Gordon Lightfoot    189
Bob Dylan           188
George Strait       188
Loretta Lynn        187
                   ... 
Ungu                  2
U-Kiss                1
Zoe                   1
Zed                   1
X-Treme               1
Name: count, Length: 643, dtype: int64

In [5]:
data.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [6]:
data=data.sample(10000).drop('link', axis=1).reset_index(drop=True)

In [7]:
data.head()

Unnamed: 0,artist,song,text
0,Kenny Rogers,I Trust You,God said \r\nMary \r\nPure and \r\nHoly \r...
1,NOFX,Pods And Gods,Martian men are coming to Earth \r\nThey're a...
2,Iggy Pop,1970,Out of my mind on Saturday night \r\n1970 rol...
3,Korn,Somebody Someone,I can't stand to let you win \r\nI'm just wat...
4,Beautiful South,Speak To Me,Speak to me \r\nSpeak to me \r\nSpeak to me ...


### Text Processing

In [8]:
data['text']=data['text'].str.lower().replace(r'^\w\s','').replace(r'\n','',regex=True) # '^\w\s' for lower case

In [9]:
stemmer=PorterStemmer()

In [10]:
stemmer.stem('beautiful')

'beauti'

In [11]:
def token(text):
    token=nltk.word_tokenize(text)
    a=[stemmer.stem(w) for w in token]
    return " ".join(a)

In [12]:
token('you are gorgeous,beautiful')

'you are gorgeou , beauti'

In [13]:
data['text'].apply(lambda x:token(x))

0       god said mari pure and holi with thi babi i tr...
1       martian men are come to earth they 're abduct ...
2       out of my mind on saturday night 1970 rollin '...
3       i ca n't stand to let you win i 'm just watch ...
4       speak to me speak to me speak to me tell me al...
                              ...                        
9995    at time it seem too rough , thing just drag me...
9996    it 's been a long time sinc i came around been...
9997    who love me even though i 'm crazi and noth th...
9998    time 's rollin ' forward i 'm gettin ' bore la...
9999    do you feel what i feel , see what i see , hea...
Name: text, Length: 10000, dtype: object

In [14]:
tfid=TfidfVectorizer(stop_words='english') #nltk-- stopwords

In [15]:
matrix=tfid.fit_transform(data['text'])

In [16]:
similarity=cosine_similarity(matrix)

In [17]:
similarity[0]

array([1.        , 0.052374  , 0.07183266, ..., 0.00158731, 0.        ,
       0.        ])

In [18]:
data[data['song'] == '1970']

Unnamed: 0,artist,song,text
2,Iggy Pop,1970,out of my mind on saturday night \r1970 rolli...


In [19]:
def recommendation(song_df):
    idx = data[data['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(data.iloc[m_id[0]].song)
        
    return songs

In [20]:
recommendation('1970')

['Blow It All Away',
 'Baby I',
 'Heavy Whispers',
 'Baby Baby',
 'Baby One More Time',
 'Secret Love',
 'Glow',
 'Please, Please, Please',
 'Through The Night',
 'Should I, Would I, Could I',
 'Baby Baby',
 "And My Baby's Gone",
 'I Love You Too Much',
 "There's Something In The Air",
 "Can't Take My Eyes Off Of You",
 'Keep It Right There',
 "Don't Tease Me",
 'Scream',
 "Don't Let Me Down",
 'Everything Will Be Alright']

In [21]:
pickle.dump(similarity,open('similarity.pkl','wb'))

In [22]:
pickle.dump(data,open('data.pkl','wb'))