## Import Packages

In [87]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


## Perform EDA

In [60]:
df = pd.read_csv('spotify_millsongdata.csv')
print(df.shape)
df.head()

(57650, 4)


Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [61]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [62]:
df.drop(['link'], axis=1, inplace=True)

In [63]:
df = df.sample(5000)

In [64]:
df.head()

Unnamed: 0,artist,song,text
51442,Scorpions,Hit Between The Eyes,Late at night when you're all alone \r\nTake ...
426,Alabama,Lonesome Valley,"\r\nWell, you got to walk that lonesome vall..."
49403,Queen Adreena,Ruby,Kill or be killed \r\nI applaud it \r\nGet a...
18181,Roxette,Neverending Love,"Chasing your shadow, \r\nThe senses together,..."
22856,Ace Of Base,Juliet,Juliet \r\nA girl you never should have met ...


## Data Preprocessing

In [67]:
df['text'][426]

"  \r\nWell, you got to walk that lonesome valley  \r\nYou got to walk it by yourselves  \r\nNobody else can walk it for you  \r\nYou got to walk it by yourselves  \r\n  \r\nNow mother walked that lonesome valley  \r\nShe had to walk it by herself  \r\nCause nobody else could walk it for her  \r\nShe had to walk it by herself  \r\n  \r\nNow father walked that lonesome valley  \r\nHe had to walk it by himself  \r\nNobody else could walk it for him  \r\nHe had to walk it by himself  \r\n  \r\nNow John, they say, he was a Baptist  \r\nWhile others say, he was a Jew  \r\nBut the holy bible plainly tells you  \r\nOh, that he was a preacher too  \r\n  \r\nYeah, you got to walk that lonesome valley  \r\nYou got to walk it by yourselves  \r\nAin't nobody else gonna go there for you  \r\nYou got to go there by yourselves  \r\n  \r\nYeah, we got to walk that lonesome valley  \r\nWe got to walk it by ourselves (by ourselves)  \r\nCause nobody else (nobody else) can walk it for us  \r\nWe got to w

In [68]:
df.shape

(5000, 3)

In [69]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex= True)

In [70]:
df.text.tail()

7557     in trinidad there was a family  \r with much c...
18714    on again  \r no i newer knew we could go on an...
55400    when you said you were leaving tomorrow that t...
9386     di di di di di la la la la la la  \r in a spac...
24885    look what i found in my beer  \r a couple of d...
Name: text, dtype: object

### Stop_words and Lemmatization 

In [71]:
nlp = spacy.load("en_core_web_md") 
def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [75]:
df['text'] = df['text'].apply(preprocess)

In [76]:
df['text'][7557]

'trinidad family  \r   confusion  \r   mama papa boy grow  \r   want marry wife  \r   meet young girl suit nice  \r   papa ask advice  \r   papa son  \r   girl sister mama know  \r   \r   woe shame scandal family  \r   woe worry shame scandal family  \r   \r   week summer come  \r   soon girl island find  \r   papa day  \r   papa shake head time  \r   marry girl  \r   girl aunty granny know hey  \r   \r   woe shame scandal family  \r   woe worry shame scandal family  \r   \r   mama cover head  \r   tell mama papa  \r   mama laugh son  \r   daddy daddy daddy know  \r   \r   mama cover head  \r   tell mama papa  \r   mama laugh son  \r   daddy daddy daddy know \r \r '

In [77]:
tf = TfidfVectorizer(analyzer='word', stop_words='english')

In [78]:
matrix = tf.fit_transform(df['text'])

In [79]:
similar = cosine_similarity(matrix)

In [81]:
similar[1]

array([0.00000000e+00, 1.00000000e+00, 6.88513863e-04, ...,
       5.70869420e-02, 4.02985911e-02, 0.00000000e+00], shape=(5000,))

In [83]:
df[df['song']=='Lonesome Valley'].index[0]

np.int64(426)

## Recommender Function

In [84]:
def recommender(song_name):
    idx = df[df['song']==song_name].index[0]
    distance = sorted(list(enumerate(similar[idx])), reverse=True, key= lambda x:x[1])
    song=[]
    for s_id in distance[1:6]:
        song.append(df.iloc[s_id[0]].song)
    return song

In [86]:
recommender('Lonesome Valley')

['Every Night',
 'I Just Wanna Stop',
 'Only See',
 'I Want You So Bad',
 'Always Find The Time']

In [89]:
pickle.dump(similar, open("similarity", "wb"))


In [90]:
pickle.dump(df, open("df", "wb"))
