In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [25]:
data=pd.read_csv('spotify_millsongdata.csv')

In [26]:
data.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [27]:
data['artist'].value_counts()

artist
Donna Summer        191
Gordon Lightfoot    189
Bob Dylan           188
George Strait       188
Loretta Lynn        187
                   ... 
Ungu                  2
U-Kiss                1
Zoe                   1
Zed                   1
X-Treme               1
Name: count, Length: 643, dtype: int64

In [28]:
data.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [29]:
data=data.sample(2000).drop('link', axis=1).reset_index(drop=True)

In [30]:
data.head()

Unnamed: 0,artist,song,text
0,System Of A Down,Needles,I cannot disguise \r\nAll the stomach pains ...
1,Planetshakers,Cry Holy,"Wondrous mystery, in the pages of history, \r..."
2,Arlo Guthrie,Lay Down Little Doggies,"\r\nCHORUS: \r\nLAY DOWN, LITTLE DOGGIES, L..."
3,Lana Del Rey,Pin Up Galore,"Baby I have become, \r\nBaby I have become so..."
4,Incubus,Oil And Water,You and I are like oil and water \r\nWe've be...


### Text Processing

In [31]:
data['text']=data['text'].str.lower().replace(r'^\w\s','').replace(r'\n','',regex=True) # '^\w\s' for lower case

In [32]:
stemmer=PorterStemmer()

In [33]:
stemmer.stem('beautiful')

'beauti'

In [34]:
def token(text):
    token=nltk.word_tokenize(text)
    a=[stemmer.stem(w) for w in token]
    return " ".join(a)

In [35]:
token('you are gorgeous,beautiful')

'you are gorgeou , beauti'

In [36]:
data['text'].apply(lambda x:token(x))

0       i can not disguis all the stomach pain and the...
1       wondrou mysteri , in the page of histori , of ...
2       choru : lay down , littl doggi , lay down we '...
3       babi i have becom , babi i have becom someon a...
4       you and i are like oil and water we 've been t...
                              ...                        
1995    it do n't mean a thing if it ai n't got that s...
1996    there 's an empti street in an empti town ther...
1997    sampl the onli thing that burn in hell is the ...
1998    i ca n't understand , she let go of my hand an...
1999    broken thread wake earli i hear myself breath ...
Name: text, Length: 2000, dtype: object

In [37]:
tfid=TfidfVectorizer(stop_words='english') #nltk-- stopwords

In [38]:
matrix=tfid.fit_transform(data['text'])

In [39]:
similarity=cosine_similarity(matrix)

In [40]:
similarity[0]

array([1.        , 0.00914256, 0.00444169, ..., 0.00185678, 0.00903112,
       0.00904598])

In [41]:
data[data['song'] == '1970']

Unnamed: 0,artist,song,text


In [42]:
def recommendation(song_df):
    idx = data[data['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(data.iloc[m_id[0]].song)
        
    return songs

In [20]:
recommendation('1970')

['Blow It All Away',
 'Baby I',
 'Heavy Whispers',
 'Baby Baby',
 'Baby One More Time',
 'Secret Love',
 'Glow',
 'Please, Please, Please',
 'Through The Night',
 'Should I, Would I, Could I',
 'Baby Baby',
 "And My Baby's Gone",
 'I Love You Too Much',
 "There's Something In The Air",
 "Can't Take My Eyes Off Of You",
 'Keep It Right There',
 "Don't Tease Me",
 'Scream',
 "Don't Let Me Down",
 'Everything Will Be Alright']

In [43]:
pickle.dump(similarity,open('similarity.pkl','wb'))

In [22]:
pickle.dump(data,open('data.pkl','wb'))