In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from scipy import sparse
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
import numpy as np
import nltk
import string


# Load the CSV file into a DataFrame
df = pd.read_json('spot_final.json')

In [13]:
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
wnl = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\limng\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
# truncate middle
def trunc(lyrics):
  lyrics_front = str(lyrics)[:128]
  lyrics_back = str(lyrics)[-384:]
  return lyrics_front + lyrics_back

In [15]:
f_df =  pd.read_json('spot_final.json')
f_df = f_df.reset_index(drop=True)
f_df['lyrics'] = f_df['lyrics'].apply(lambda x : x.translate(str.maketrans("", "", string.punctuation)))
f_df['soup'] = f_df['lyrics'].apply(lambda x : wnl.lemmatize(trunc(x), pos="v")) + f_df['name'] + ' by ' + f_df['artists_names']
f_df['soup'] = f_df['soup'].apply(str)

numerical_features = ['acousticness', 'danceability', 'duration_ms', 
                    'energy', 'instrumentalness', 'loudness', 'liveness', 
                    'speechiness', 'time_signature', 'key', 'valence', 
                    'tempo','popularity', 'artists_mean_popularities',
                    'artists_mean_followers']

# scaling numerical features for similarity matrix
std_scaler = StandardScaler()
f_df[numerical_features] = std_scaler.fit_transform(f_df[numerical_features]).astype(np.float32)
# drop
f_df = f_df.drop(['track_uri', 'artists_genres','n_playlist','lyrics','playlist_uris'],axis= 1)
f_df.head(3)

Unnamed: 0,name,artists_names,popularity,album_type,danceability,energy,key,loudness,mode,speechiness,...,liveness,valence,tempo,duration_ms,time_signature,release_year,artists_mean_popularities,artists_mean_followers,tt_score,soup
0,The Safety Dance,Men Without Hats,0.270529,album,0.055151,0.899171,-0.068183,0.258057,1,-0.450969,...,-0.214522,0.596964,-0.668881,-0.65485,0.235496,1982,0.267889,-0.284085,5.418839,We can dance if we want to We can leave your ...
1,Endless Summer,Grizfolk,0.613427,single,-0.367225,0.474117,-0.349462,0.570241,1,-0.496648,...,2.336627,-0.125196,-1.338745,-0.175897,0.235496,2018,0.136046,-0.292482,4.115897,Take it back its over time Ooh you know its o...
2,Castaway,Zac Brown Band,-1.315375,album,0.277755,0.686644,0.775654,0.770145,1,-0.368127,...,-0.489085,1.715148,-0.636752,-0.433272,0.235496,2015,1.058946,0.138477,4.085013,Castaway Ride the waves like we’re young Caus...


In [16]:
from sklearn.compose import ColumnTransformer

numerical_features.append('tt_score')

num_feat = f_df[numerical_features].astype(pd.SparseDtype("float32",0)).sparse.to_coo().tocsr()
numerical_features.append('soup')
tfidf = TfidfVectorizer(stop_words= 'english', max_features=10000,dtype = np.float32)
soup_tfidf = tfidf.fit_transform(f_df['soup'])
soup_tfidf = soup_tfidf.astype('float32')

final_matrix = sparse.hstack([num_feat,soup_tfidf])

In [17]:
def cosine_similarity_n_space(m1, m2, batch_size=100):
    assert m1.shape[1] == m2.shape[1]
    ret = np.ndarray((m1.shape[0], m2.shape[0]), dtype=np.float32)
    for row_i in range(0, int(m1.shape[0] / batch_size) + 1):
        start = row_i * batch_size
        end = min([(row_i + 1) * batch_size, m1.shape[0]])
        if end <= start:
            break
        rows = m1[start: end]
        sim = cosine_similarity(rows, m2) # rows is O(1) size
        ret[start: end] = sim
    return ret

In [None]:
tfidf_sim_matrix = None
tfidf_sim_matrix = cosine_similarity_n_space(final_matrix,final_matrix,batch_size=100)

In [9]:
#reset index
df.reset_index(drop=True, inplace=True)

# Create a mapping of track URIs to their respective indices in the sampled DataFrame
track_uri_to_index_sampled = pd.Series(df.index, index=df['track_uri'])


In [15]:
# Method to get top 5 songs

# Function to get song recommendations based on song title for the sampled playlists
def get_recommendations_sampled(song_title, tfidf_sim_matrix):
    # Get the index of the song
    song_index = track_uri_to_index_sampled[song_title]
    # print(song_index)
    
    # Get the pairwise similarity scores of all songs with the given song
    sim_scores = list(enumerate(tfidf_sim_matrix[song_index]))
    
    # Sort the songs based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the top 5 most similar songs
    sim_scores = sim_scores[1:6]  # Exclude the first song (itself)
    
    # Get the indices of the similar songs
    song_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 similar songs
    return df['track_uri'].iloc[song_indices]

In [16]:
def calculate_accuracy_sampled(playlist_df):
    # Get the songs in the playlist
    songs_in_playlist = playlist_df['track_uri'].values

    # Initialize variables to store accuracy for this playlist and the total number of correct recommendations
    playlist_accuracy = 0
    total_correct_recommendations = 0

    # Set to keep track of recommended songs
    recommended_songs_set = set()

    # Iterate through each song in the playlist
    for song_uri in songs_in_playlist:
        # Recommend the same number as in the playlist, excluding already recommended songs
        recommended_songs = get_recommendations_sampled(song_uri, tfidf_sim_matrix)
        recommended_songs = [song for song in recommended_songs if song not in recommended_songs_set]

        # Calculate the number of correct recommendations (intersection between recommended and actual songs)
        correct_recommendations = len(set(songs_in_playlist) & set(recommended_songs))
        total_correct_recommendations += correct_recommendations

        # Update the set of recommended songs
        recommended_songs_set.update(recommended_songs)

    # Calculate accuracy for this playlist
    playlist_accuracy = total_correct_recommendations / len(songs_in_playlist)

    return playlist_accuracy


In [None]:
# Convert 'playlist_uris' column to tuples
df['playlist_uris'] = df['playlist_uris'].apply(tuple)

# Explode the DataFrame to have one row for each playlist URI in the lists
exploded_df = df.explode('playlist_uris')

# Group the exploded DataFrame by 'playlist_uris'
grouped_df = exploded_df.groupby('playlist_uris')

# Initialize variables to store overall accuracy and the number of playlists processed for the sampled playlists
overall_accuracy_sampled = 0 
num_playlists_sampled = 0

# Iterate through each unique playlist in the sampled dataset
for playlist_uri, playlist_df in grouped_df:
    num_playlists_sampled += 1
    playlist_accuracy_sampled = calculate_accuracy_sampled(playlist_df)
    overall_accuracy_sampled += playlist_accuracy_sampled

# Calculate the overall accuracy score for the sampled playlists
overall_accuracy_sampled /= num_playlists_sampled

print(f"Overall accuracy for sampled playlists: {overall_accuracy_sampled}")