In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from scipy import sparse
import numpy as np
import random

def cosine_similarity_n_space(m1, m2, batch_size=100):
    """Compute cosine similarity between every row of ``m1`` and every row of ``m2``.

    The rows of ``m1`` are handed to ``cosine_similarity`` in slices of at most
    ``batch_size`` rows, and each partial result is written into one
    preallocated float32 matrix.

    Parameters
    ----------
    m1, m2 : 2-D array or sparse matrix
        Must expose ``.shape``, support row slicing (``m1[start:end]``) and
        have the same number of columns.
    batch_size : int, default 100
        Maximum number of ``m1`` rows per ``cosine_similarity`` call.
        Must be at least 1.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(m1.shape[0], m2.shape[0])``.

    Raises
    ------
    AssertionError
        If ``m1`` and ``m2`` have a different number of columns.
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    assert m1.shape[1] == m2.shape[1]
    # A non-positive batch size would either divide by zero or skip the loop
    # entirely and hand back the uninitialized buffer, so reject it up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    n_rows = m1.shape[0]
    # Uninitialized on purpose: every row is overwritten by the loop below.
    ret = np.empty((n_rows, m2.shape[0]), dtype=np.float32)
    for start in range(0, n_rows, batch_size):
        end = min(start + batch_size, n_rows)
        ret[start:end] = cosine_similarity(m1[start:end], m2)
    return ret

# Read the track dataset (a JSON file) into a DataFrame
file_path = r'spot_final.json'
df = pd.read_json(file_path)

# Numeric columns used as the feature vector for content-based filtering
numerical_features = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'loudness', 'liveness', 'speechiness',
    'time_signature', 'key', 'valence', 'tempo',
]

# Replace missing feature values with the mean of their own column
df[numerical_features] = df[numerical_features].fillna(
    df[numerical_features].mean()
)

# Rescale every feature column into the [0, 1] range
scaler = MinMaxScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Pack the scaled features into a CSR matrix and compute the full
# track-by-track cosine similarity matrix in batches of 100 rows
numerical_features_matrix_sampled = df[numerical_features].values
numerical_features_sparse_sampled = sparse.csr_matrix(numerical_features_matrix_sampled)
numerical_similarity_sampled = cosine_similarity_n_space(
    numerical_features_sparse_sampled,
    numerical_features_sparse_sampled,
    batch_size=100,
)

# Swap the existing index for a fresh 0..n-1 index (old index is discarded)
df = df.reset_index(drop=True)

# Lookup table: track URI -> row index in the DataFrame
track_uri_to_index_sampled = pd.Series(df.index, index=df['track_uri'])


In [15]:
#Method to get top 5 songs

# Function to get song recommendations for a track, looked up by its track URI
def get_recommendations_sampled(song_title, numerical_similarity_sampled, top_n=5):
    """Return the track URIs of the songs most similar to the given song.

    Relies on the module-level ``track_uri_to_index_sampled`` lookup and the
    module-level ``df`` DataFrame.

    Parameters
    ----------
    song_title : hashable
        Key into ``track_uri_to_index_sampled`` (that mapping is indexed by
        ``df['track_uri']``, so despite the name this is a track URI).
    numerical_similarity_sampled : 2-D array
        Pairwise similarity matrix; row ``i`` holds the scores of song ``i``
        against every song.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``df['track_uri']`` entries of the most similar songs, best first.
        The query song itself is never included.

    Raises
    ------
    KeyError
        If ``song_title`` is not present in ``track_uri_to_index_sampled``.
    """
    # Row position of the query song
    song_index = track_uri_to_index_sampled[song_title]

    # Similarity of the query song against every song
    scores = np.asarray(numerical_similarity_sampled[song_index])

    # Stable sort on the negated scores: highest score first, ties kept in
    # ascending index order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query song by index rather than by assuming it sorts first:
    # another song with a tied score and a lower index would otherwise push
    # the query song into its own recommendations.
    ranked = ranked[ranked != song_index]

    song_indices = ranked[:top_n]
    return df['track_uri'].iloc[song_indices]

In [16]:
def calculate_accuracy_sampled(playlist_df):
    """Score how often a playlist's recommendations land inside the same playlist.

    For every song in the playlist, ``get_recommendations_sampled`` is called
    with the module-level ``numerical_similarity_sampled`` matrix.
    Recommendations already produced for an earlier song of this playlist are
    ignored, so each recommended URI is counted at most once. The score is the
    number of counted recommendations that are themselves in the playlist,
    divided by the number of rows in the playlist.

    Parameters
    ----------
    playlist_df : pandas.DataFrame
        Rows of one playlist; must contain a ``'track_uri'`` column.

    Returns
    -------
    float
        The score described above; ``0.0`` for a playlist with no rows.
    """
    songs_in_playlist = playlist_df['track_uri'].values

    # Nothing to recommend from; avoid dividing by zero below.
    if len(songs_in_playlist) == 0:
        return 0.0

    # Built once: the playlist membership does not change inside the loop.
    playlist_song_set = set(songs_in_playlist)

    # URIs already recommended for an earlier song of this playlist
    recommended_songs_set = set()
    total_correct_recommendations = 0

    for song_uri in songs_in_playlist:
        recommended_songs = get_recommendations_sampled(song_uri, numerical_similarity_sampled)

        # Keep only recommendations not seen before for this playlist
        new_recommendations = {
            song for song in recommended_songs if song not in recommended_songs_set
        }

        # A recommendation is "correct" when it is itself part of the playlist
        total_correct_recommendations += len(playlist_song_set & new_recommendations)

        recommended_songs_set |= new_recommendations

    return total_correct_recommendations / len(songs_in_playlist)


In [17]:

# Turn every 'playlist_uris' entry into a tuple
df['playlist_uris'] = df['playlist_uris'].apply(tuple)

# One row per (track, playlist URI) pair
exploded_df = df.explode('playlist_uris')

# Bucket those rows by playlist URI
grouped_df = exploded_df.groupby('playlist_uris')

# Running total of per-playlist scores, and how many playlists were scored
overall_accuracy_sampled = 0
num_playlists_sampled = 0

# Score each playlist in turn
for playlist_uri, playlist_df in grouped_df:
    num_playlists_sampled = num_playlists_sampled + 1
    playlist_accuracy_sampled = calculate_accuracy_sampled(playlist_df)
    overall_accuracy_sampled = overall_accuracy_sampled + playlist_accuracy_sampled

# Mean score across all playlists
overall_accuracy_sampled = overall_accuracy_sampled / num_playlists_sampled

print(f"Overall accuracy for sampled playlists: {overall_accuracy_sampled}")

Overall accuracy for sampled playlists: 0.11638808995189474


In [18]:
overall_accuracy_sampled

0.11638808995189474

In [19]:
num_playlists_sampled

861