###Importing libraries

In [115]:
import pandas as pd
import numpy as np
import nltk
import pickle
from nltk.stem.porter import PorterStemmer   ### this is used in tokenization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


###Installing the requirements

In [116]:
!pip install nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [117]:
df = pd.read_csv ("spotify_millsongdata.csv")

In [118]:
df.shape

(57650, 4)

In [119]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [120]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [121]:
df=df.sample(5000).drop('link', axis=1).reset_index(drop = True)

In [122]:
df.shape

(5000, 3)

In [123]:
df.head(5)

Unnamed: 0,artist,song,text
0,Utopia,Crystal Ball,"Well, I've heard it, I've heard it all \r\nI'..."
1,Bosson,Baby Don't Cry,"Baby don't cry over me, save your love \r\nBa..."
2,Indigo Girls,Come Down In Time,In the quiet silent seconds I \r\nI turned of...
3,Janis Joplin,"Bye, Bye Baby","Bye, bye-bye, baby, bye-bye. \r\nI gotta be s..."
4,Iron Maiden,Lord Of The Flies,I don't care for this world anymore \r\nI jus...


In [124]:
df['text'][0]

"Well, I've heard it, I've heard it all  \r\nI've heard it all before  \r\nI don't need no gypsy  \r\nI don't need no crystal ball  \r\n  \r\nSince you started messin' round on me darlin'  \r\nNow the phone's ringing off the wall  \r\nAnd everybody says it's so  \r\nI got a broken heart comin' and they want me to know  \r\n  \r\nAnd I've heard it all  \r\nI don't need a crystal ball.  \r\nI got a lady across the hallway  \r\nI got another one way across town  \r\n  \r\nAll the people at your birthday party  \r\nYou should hear the way they're puttin' me down?  \r\nIt seems the whole world's out to spy  \r\nIt's like the CIA meets the FBI  \r\n  \r\nAnd I've heard it all  \r\nI don't need a crystal ball  \r\nWell I've heard it all  \r\nI don't need a crystal ball  \r\n  \r\nWell I've heard it all  \r\nIt ain't no surprise to me  \r\nI don't watch no television  \r\nI don't read the daily news  \r\n  \r\nI'm afraid if I listen to the radio  \r\nI'll hear about the latest things you do  \

In [125]:
#lets remove the unwanted  space and \r

##Text cleaning / Text preprocessing


In [126]:
df['text'].str.lower().replace(r'\w\s',' ').replace(r'\n',' ',regex=True)

0       well, i've heard it, i've heard it all  \r i'v...
1       baby don't cry over me, save your love  \r bab...
2       in the quiet silent seconds i  \r i turned off...
3       bye, bye-bye, baby, bye-bye.  \r i gotta be se...
4       i don't care for this world anymore  \r i just...
                              ...                        
4995    he said i'm gonna buy this place and burn it d...
4996    sun comes up on this new morning  \r shifting ...
4997    and if i could hold you where would you belong...
4998    i used to sing to her  \r it used to make her ...
4999    don't get much slicker than fab i strap up wit...
Name: text, Length: 5000, dtype: object

In [127]:
df['text']=df['text'].str.lower().replace(r'\w\s',' ').replace(r'\n',' ',regex=True)

##Text tokenization

In [128]:
df

Unnamed: 0,artist,song,text
0,Utopia,Crystal Ball,"well, i've heard it, i've heard it all \r i'v..."
1,Bosson,Baby Don't Cry,"baby don't cry over me, save your love \r bab..."
2,Indigo Girls,Come Down In Time,in the quiet silent seconds i \r i turned off...
3,Janis Joplin,"Bye, Bye Baby","bye, bye-bye, baby, bye-bye. \r i gotta be se..."
4,Iron Maiden,Lord Of The Flies,i don't care for this world anymore \r i just...
...,...,...,...
4995,Coldplay,A Rush Of Blood To The Head,he said i'm gonna buy this place and burn it d...
4996,Michael Jackson,For All Time,sun comes up on this new morning \r shifting ...
4997,Ocean Colour Scene,You've Got It Bad,and if i could hold you where would you belong...
4998,Bryan White,That's Another Song,i used to sing to her \r it used to make her ...


In [129]:
df.tail(5)

Unnamed: 0,artist,song,text
4995,Coldplay,A Rush Of Blood To The Head,he said i'm gonna buy this place and burn it d...
4996,Michael Jackson,For All Time,sun comes up on this new morning \r shifting ...
4997,Ocean Colour Scene,You've Got It Bad,and if i could hold you where would you belong...
4998,Bryan White,That's Another Song,i used to sing to her \r it used to make her ...
4999,Fabolous,Rap City Freestyle,don't get much slicker than fab i strap up wit...


In [130]:
stemmer = PorterStemmer()            #it is used to reducing word into word stem (one kind of short form used in machine to understand)

In [131]:
def token(txt):
    tokens = nltk.word_tokenize(txt)
    a = [stemmer.stem(w) for w in tokens]
    return " ".join(a)

In [132]:
df['text'].apply(lambda x: token(x))

0       well , i 've heard it , i 've heard it all i '...
1       babi do n't cri over me , save your love babi ...
2       in the quiet silent second i i turn off the li...
3       bye , bye-by , babi , bye-by . i got ta be see...
4       i do n't care for thi world anymor i just want...
                              ...                        
4995    he said i 'm gon na buy thi place and burn it ...
4996    sun come up on thi new morn shift shadow , a s...
4997    and if i could hold you where would you belong...
4998    i use to sing to her it use to make her smile ...
4999    do n't get much slicker than fab i strap up wi...
Name: text, Length: 5000, dtype: object

In [133]:
tfid = TfidfVectorizer(analyzer='word', stop_words='english')

In [134]:
matrix = tfid.fit_transform(df['text'])

In [149]:
similer = cosine_similarity(matrix)

In [150]:
similer[0]

array([1.        , 0.11893754, 0.11069022, ..., 0.09650565, 0.06413255,
       0.05132914])

In [152]:
df[df['song']=='I Like Death']

Unnamed: 0,artist,song,text


##Recommender Function

In [157]:
def recommender(song_name):
    song_matches = df[df['song'] == song_name]
    if len(song_matches) > 0:
        idx = song_matches.index[0]
        distance = sorted(list(enumerate(similer[idx])), reverse=True, key=lambda x: x[1])
        song = []
        for s_id in distance[1:21]:
            song.append(df.iloc[s_id[0]]['song'])
        return song
    else:
        return "Song not found in the dataset or no similar songs available."


In [158]:
recommender("That's Another Song")

["It's All Over Now",
 "It's All Over Now",
 'Man Of Steel',
 'Can We Go Back?',
 'The Things I Used To Do',
 "I Don't Care",
 'Remember',
 'The Way You Do The Things You Do',
 "And You Don't Remember",
 'Could I Have This Kiss Forever',
 'Sign Of The Times',
 'Baby Hold On To Me',
 'Someone That I Used To Love',
 "We Don't Talk Anymore",
 "Should've Been Me",
 'If I Could Turn Back Time (Live)',
 'Two Voices One Song',
 'To Think I Used To Love You',
 "It's The Same Old Song",
 'I Miss You Like Crazy']

In [159]:
pickle.dump(similer,open("similarity","wb"))


In [160]:
pickle.dump(df,open("df","wb"))