In [4]:
import os
import json

import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [6]:
# Load the song table from CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('data/spotify_millsongdata.csv')

In [10]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


Top artists

In [11]:
# Count songs per artist (value_counts sorts descending) and keep the ten most frequent.
top_artists = df['artist'].value_counts().iloc[:10]
print("\nTop 10 Artists:", top_artists, sep="\n")


Top 10 Artists:
artist
Donna Summer        191
Gordon Lightfoot    189
Bob Dylan           188
George Strait       188
Loretta Lynn        187
Alabama             187
Cher                187
Reba Mcentire       187
Chaka Khan          186
Dean Martin         186
Name: count, dtype: int64


Subsample the data to reduce computation time

In [12]:
# Draw a fixed-seed subsample so that the rows (and every result derived from
# them) are the same after a kernel restart. The size is capped at the table
# length because DataFrame.sample raises when asked for more rows than exist.
df = df.sample(n=min(10000, len(df)), random_state=42)

# 'link' is not used by the cells below. errors='ignore' keeps this cell
# re-runnable once the column has already been removed; reset_index gives the
# sampled rows a fresh 0..n-1 index.
df = df.drop(columns='link', errors='ignore').reset_index(drop=True)

Download NLTK data

In [13]:
# Download the NLTK resources used by the cells below: the 'punkt' and
# 'punkt_tab' tokenizer packages and the 'stopwords' corpus.
# The value of the last call is what the cell displays as its output.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mitja\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Mitja\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mitja\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [14]:
# English stop words from NLTK, held in a set for fast membership checks
# when tokens are filtered in preprocess_text.
stop_words = set(stopwords.words('english'))

In [15]:
def preprocess_text(text):
    """Reduce raw lyrics to a space-separated string of lower-case content words.

    Every character that is not an ASCII letter or whitespace is deleted, the
    remainder is lower-cased and split with ``word_tokenize``, and tokens found
    in the module-level ``stop_words`` set are dropped.

    NOTE(review): apostrophes are deleted before the stop-word check, so a
    contraction such as "don't" is looked up as "dont". Any stop-word entry
    that contains an apostrophe can therefore never match -- confirm this is
    intended.

    Parameters
    ----------
    text : str
        Raw lyric text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Strip digits/punctuation first, then fold case (same order as before).
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text).lower()
    content_words = (
        token for token in word_tokenize(letters_only) if token not in stop_words
    )
    return " ".join(content_words)

In [16]:
# Run the lyric normaliser over every row and store the result next to the raw text.
df['cleaned_text'] = df['text'].map(preprocess_text)

In [17]:
# Preview the table again to compare the raw text with the cleaned_text column.
df.head()

Unnamed: 0,artist,song,text,cleaned_text
0,Zoegirl,Plain,He made you feel plain \r\nWhen he forgot you...,made feel plain forgot name let tell something...
1,Eminem,Love Me,[Obie Trice] \r\nYou don't see me in the hood...,obie trice dont see hood cause im man niggas i...
2,Reba Mcentire,Don't Forget Your Way Home,Now you spread your wings to fly \r\nThere ar...,spread wings fly new things must try matter go...
3,Bruno Mars,Long Distance,There's only so many songs that I can sing \r...,theres many songs sing pass time im running th...
4,Eurythmics,Possessed,Got tired of being gifted and tormented \r\nS...,got tired gifted tormented signed sealed deliv...


In [18]:
# Turn the cleaned lyrics into TF-IDF vectors (one row per song).
# max_features=5000 limits the vocabulary to at most 5000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])

In [19]:
# Pairwise cosine similarity between all song vectors; with a single argument
# the rows of the matrix are compared against themselves.
cosine_sim = cosine_similarity(tfidf_matrix)

In [21]:
# Display the similarity matrix (NumPy abbreviates the printed array).
cosine_sim

array([[1.        , 0.03890438, 0.08273981, ..., 0.05423727, 0.11948811,
        0.00543657],
       [0.03890438, 1.        , 0.06566851, ..., 0.10810012, 0.06401946,
        0.04570384],
       [0.08273981, 0.06566851, 1.        , ..., 0.00436736, 0.04566322,
        0.020479  ],
       ...,
       [0.05423727, 0.10810012, 0.00436736, ..., 1.        , 0.01244013,
        0.00583269],
       [0.11948811, 0.06401946, 0.04566322, ..., 0.01244013, 1.        ,
        0.03553207],
       [0.00543657, 0.04570384, 0.020479  , ..., 0.00583269, 0.03553207,
        1.        ]], shape=(10000, 10000))

In [20]:
def recommend_songs(song_name, cosine_sim=cosine_sim, df=df, top_n=5):
    """Return the ``top_n`` songs whose lyrics are most similar to ``song_name``.

    Parameters
    ----------
    song_name : str
        Title to look up; compared case-insensitively against ``df['song']``.
        If several rows share the title, the first one is used.
    cosine_sim : array-like of shape (n_songs, n_songs)
        Pairwise similarity matrix whose row/column order matches the row
        order of ``df``. Bound to the notebook-level matrix at definition time.
    df : pandas.DataFrame
        Song table with at least ``artist`` and ``song`` columns. Bound to the
        notebook-level frame at definition time.
    top_n : int
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        The ``artist``/``song`` rows ordered from most to least similar, or
        the message "Song not found in the dataset!" when no title matches.
    """
    # Locate the song by row *position* rather than by index label, so the
    # lookup into cosine_sim and .iloc stays correct even if df does not carry
    # a 0..n-1 index.
    is_match = (df['song'].str.lower() == song_name.lower()).to_numpy(
        dtype=bool, na_value=False
    )
    matches = np.flatnonzero(is_match)
    if len(matches) == 0:
        # TODO: could be reworked to handle songs not present in the dataset.
        return "Song not found in the dataset!"
    query_pos = int(matches[0])

    # Rank every song by similarity to the query, highest first. A stable sort
    # keeps equal scores in row order, matching the previous ordering.
    scores = np.asarray(cosine_sim[query_pos], dtype=float).ravel()
    ranked = np.argsort(-scores, kind="stable")

    # Remove the query song explicitly instead of assuming it is ranked first:
    # a row with identical lyrics and a lower position, or a query whose vector
    # is all zeros, would otherwise let the song recommend itself.
    ranked = ranked[ranked != query_pos]
    top_positions = ranked[:max(top_n, 0)]

    return df[['artist', 'song']].iloc[top_positions]
     
