In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the full song table from a local pickle (presumably the output of an
# earlier cleaning step, judging by the file name -- confirm).
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
df_song = pd.read_pickle('./data/song_cleaned.pkl')

In [3]:
# Draw a reproducible 1% random sample (fixed seed) of the songs.
# The index is reset to 0..n-1 so that a row's label equals its position,
# which is what the similarity-matrix lookups further down rely on.
df_song_sample = (
    df_song
    .sample(frac=0.01, random_state=42)
    .reset_index(drop=True)
)
df_song_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9991 entries, 0 to 9990
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   song_num     9991 non-null   int32 
 1   title        9991 non-null   object
 2   release      9991 non-null   object
 3   artist_name  9991 non-null   object
 4   year         9991 non-null   int32 
dtypes: int32(2), object(3)
memory usage: 312.3+ KB


## Create TF-IDF feature vectors

In [4]:
# One text "document" per song: "<artist_name> <year>".
df_song_sample['combined_features'] = df_song_sample['artist_name'].str.cat(
    df_song_sample['year'].astype(str),
    sep=" ",
)

# Fit TF-IDF on those documents; each row of tfidf_matrix describes one song.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_song_sample['combined_features'])

## Compute the cosine similarity matrix 

In [5]:
# Pairwise cosine similarity between every pair of songs in the sample.
# With a single argument, cosine_similarity compares the rows of the matrix
# against themselves, so entry [i, j] is the similarity of song i and song j.
cosine_sim = cosine_similarity(tfidf_matrix)

In [6]:
# Build the two lookup tables between DataFrame row index and song_num.
# Naming caveat: song_num_index maps row index -> song_num, while
# index_song_num is its inverse and maps song_num -> row index.
song_num_index = df_song_sample['song_num'].to_dict()
index_song_num = dict(zip(song_num_index.values(), song_num_index.keys()))

In [7]:
# Function to get recommendations
def get_recommendations(song_index, cosine_sim, top_n=10):
    """Return the row indices of the songs most similar to one song.

    Args:
        song_index: Row position of the query song in ``cosine_sim``.
            Negative values are interpreted like normal Python indices.
        cosine_sim: Pairwise similarity matrix in which row ``i`` holds the
            similarity of song ``i`` to every song (any object supporting
            ``cosine_sim[song_index]``, ``len`` and iteration on that row).
        top_n: Maximum number of neighbours to return. Defaults to 10.

    Returns:
        A list of at most ``top_n`` row indices ordered by descending
        similarity. Songs with equal scores keep ascending index order.
        The query song itself is never part of the result.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Get the pairwise similarity scores of all songs with that song
    row = cosine_sim[song_index]

    # Normalise a negative index so the self-exclusion check below matches.
    if song_index < 0:
        song_index += len(row)

    # Exclude the query song by index rather than by assuming it sorts first:
    # other songs can tie with it (e.g. identical artist and year) or even
    # outrank it when its feature vector is all zeros, in which case dropping
    # "position 0" would discard a real neighbour and return the song itself.
    candidates = [
        (idx, score)
        for idx, score in enumerate(row)
        if idx != song_index
    ]

    # Stable sort: ties stay in ascending index order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Return the indices of the top_n most similar songs
    return [idx for idx, _ in candidates[:top_n]]


In [10]:
# Example usage
# Translate a song_num into its row index in the sample, then ask for its
# nearest neighbours. index_song_num maps song_num -> row index (despite its
# name); the lookup raises KeyError if this song_num is not in the sample.
song_num = 540868
song_index = index_song_num[song_num]
# The result is a list of row indices into df_song_sample, not song_num values.
recommendations = get_recommendations(song_index, cosine_sim)
print(recommendations)


[3256, 9139, 8214, 226, 2858, 247, 747, 8669, 824, 5878]


In [11]:
# Show song_num and title for the recommended rows.
# Rows are returned in DataFrame index order, not in similarity-rank order.
df_song_sample.loc[
    df_song_sample.index.isin(recommendations),
    ['song_num', 'title'],
]

Unnamed: 0,song_num,title
226,98406,Intro/I'll Take Jesus
247,73908,Hold On I'm Coming
747,386919,Turn Around
824,222724,Dust Ballad
2858,221067,The Giraffe (Minimaril Cupcake Remix)
3256,563401,I Got a Man (Re-Recorded / Remastered)
5878,305368,Here I am
8214,205695,Escape
8669,75811,Soon Sealed
9139,653887,About Mine (feat. Mo'Betta_ A-Yes & Fatal Lucc...


In [None]:
# save the cosine similarity matrix and the song_num_index dictionary
import pickle

# Bundle everything needed to serve recommendations later:
#   cosine_sim      - pairwise song-to-song similarity matrix
#   song_num_index  - row index -> song_num
#   index_song_num  - song_num -> row index
model = {'cosine_sim': cosine_sim, 'song_num_index': song_num_index, 'index_song_num': index_song_num}

# Use a context manager so the file handle is flushed and closed even if
# pickling fails; the bare open() previously left the handle dangling.
with open('./models/item_similarity.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)