In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from scipy import sparse
import numpy as np
import nltk
import string


In [2]:
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
wnl = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to C:\Users\Malcolm
[nltk_data]     Sng\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Content Based Filtering Initial
- Applied a rating to every song, which is just a linear sum of all numerical features, ratings will be used for evaluation


In [3]:
df = pd.read_json('spot_final.json')
df = df.reset_index(drop=True)
# df['lyrics'] = df['lyrics'].apply(lambda x : x.translate(str.maketrans("", "", string.punctuation)))
# df['lyrics'] = df['lyrics'].apply(lambda x : wnl.lemmatize(trunc(x), pos="v"))
# df['lyrics'][df['lyrics'].apply(len) < 512] = df['name'] + ' by ' + df['artists_names']
# df['lyrics'] = df['lyrics'].apply(str)
# same range as tiktok score
mm_scaler = MinMaxScaler(feature_range= (-5,5))
numerical_features = ['acousticness', 'danceability', 'duration_ms', 
                    'energy', 'instrumentalness', 'loudness', 'liveness', 
                    'speechiness', 'time_signature', 'key', 'valence', 
                    'tempo','popularity', 'artists_mean_popularities',
                    'artists_mean_followers']
num_feat = mm_scaler.fit_transform(df[numerical_features])
# Sum of all rating features for each song
numerical_features.append('rating')
df['rating'] = num_feat.sum(axis=1)
# same range as tiktok score
df['rating'] = mm_scaler.fit_transform(df['rating'].array.reshape(-1,1))

# scaling numerical features for similarity matrix
std_scaler = StandardScaler()
df[numerical_features] = std_scaler.fit_transform(df[numerical_features])
# drop
df = df.drop(['track_uri', 'artists_genres','n_playlist', 'tt_score','lyrics','playlist_uris'],axis= 1)
df

Unnamed: 0,name,artists_names,popularity,album_type,danceability,energy,key,loudness,mode,speechiness,...,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,release_year,artists_mean_popularities,artists_mean_followers,rating
0,The Safety Dance,Men Without Hats,0.270529,album,0.055151,0.899171,-0.068183,0.258057,1,-0.450969,...,-0.568391,-0.214522,0.596964,-0.668881,-0.654850,0.235496,1982,0.267889,-0.284085,-0.225001
1,Endless Summer,Grizfolk,0.613427,single,-0.367225,0.474117,-0.349462,0.570241,1,-0.496648,...,-0.568391,2.336626,-0.125196,-1.338745,-0.175897,0.235496,2018,0.136046,-0.292482,-0.190103
2,Castaway,Zac Brown Band,-1.315375,album,0.277755,0.686644,0.775654,0.770145,1,-0.368127,...,-0.568391,-0.489085,1.715148,-0.636752,-0.433272,0.235496,2015,1.058946,0.138477,0.494623
3,Islands In the Stream,Dolly Parton,1.342086,compilation,0.312002,-0.746946,0.775654,-0.486751,1,-0.446323,...,-0.568388,-0.640095,1.027931,2.741206,0.236791,0.235496,1982,1.058946,-0.042410,1.669950
4,21 Summer,Brothers Osborne,1.127775,album,0.466112,1.103970,-0.912020,0.414681,1,-0.478067,...,-0.568314,-0.557726,0.565904,-0.284258,-0.157160,0.235496,2016,0.663417,-0.165526,0.001706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58315,Down Under,Men At Work,1.684984,compilation,1.048306,0.717557,1.619491,0.821407,0,-0.536133,...,-0.489588,-0.841441,1.621966,-0.442136,-0.089218,0.235496,1996,0.751312,-0.172470,1.659890
58316,You Can Call Me Al,Paul Simon,0.999188,compilation,1.225248,0.466389,-0.068183,0.056556,1,-0.371224,...,-0.535783,-0.622363,1.466663,0.253042,0.492624,0.235496,2007,0.927103,-0.045549,0.898265
58317,Dancing In the Dark,Bruce Springsteen,1.985020,album,-0.201699,1.304904,-1.193299,0.654672,0,-0.451743,...,-0.568391,-0.071520,0.084463,0.923729,0.125095,0.235496,1984,1.190788,0.392760,0.468635
58318,The Power Of Love,Huey Lewis & The News,1.856433,compilation,1.173878,0.868258,-0.068183,0.748860,1,-0.492777,...,-0.568303,-0.592046,1.897630,-0.062188,0.051973,0.235496,2006,0.707365,-0.127926,1.319596


convert dataframe with numerical columns to sparse matrix

In [4]:
numerical_features.append('rating')
initial_matrix = df[numerical_features].astype(pd.SparseDtype("float32",0)).sparse.to_coo().tocsr()
initial_matrix.shape

(58320, 17)

In [27]:
def cosine_similarity_n_space(m1, m2, batch_size=100):
    assert m1.shape[1] == m2.shape[1]
    ret = np.ndarray((m1.shape[0], m2.shape[0]), dtype=np.float32)
    for row_i in range(0, int(m1.shape[0] / batch_size) + 1):
        start = row_i * batch_size
        end = min([(row_i + 1) * batch_size, m1.shape[0]])
        if end <= start:
            break 
        rows = m1[start: end]
        sim = cosine_similarity(rows, m2) # rows is O(1) size
        ret[start: end] = sim
    return ret

In [28]:
sim_matrix = None
sim_matrix = cosine_similarity_n_space(initial_matrix,initial_matrix)


### Recommend song function:
- `threshold`: minimum similarity score to be recommended
- `strict_k`: if `True`, recommend k songs even if some are below threshold 

In [11]:
def recommend_songs(data, song, similarity_matrix = sim_matrix, top_k=10, threshold = 0.5, strict_k = True):
  index = data.index[data['name'] == song][0]
  score_arr = similarity_matrix[index].toarray()
  if not strict_k:
    score_arr =   np.fromiter((x for x in score_arr if x > 0.5), dtype=score_arr.dtype)
  reversed_score_arr = np.argsort(score_arr)[::-1]
  top_k_songs = reversed_score_arr[0][1:top_k + 1]
  return data.iloc[top_k_songs]

def recommend_songs_index(index, similarity_matrix = sim_matrix, top_k=10, threshold = 0.5):
  score_arr = similarity_matrix[index].toarray()
  score_arr =   np.fromiter((x for x in score_arr[0] if x > threshold), dtype=score_arr.dtype)
  top_k = min(top_k,len(score_arr))
  reversed_score_arr = np.argsort(score_arr)[::-1]
  return reversed_score_arr[1:top_k + 1]


In [12]:
top_k= recommend_songs(df, 'Dancing In the Dark', sim_matrix,top_k=10)
top_k

Unnamed: 0,name,artists_names,popularity,album_type,danceability,energy,key,loudness,mode,speechiness,...,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,release_year,artists_mean_popularities,artists_mean_followers,rating
48200,Background for Making Cocktails at Home,Restaurant Jazz Music Universe,-1.186788,album,0.055151,-1.670471,1.056933,-0.861017,0,-0.423097,...,1.548132,-0.540566,0.177645,-1.550774,-1.179906,0.235496,2021,-1.709753,-0.310872,-0.176247
57470,Over the Rainbow,Various Artists,-1.315375,compilation,-0.447134,-1.759346,1.338212,-0.702264,1,-0.468776,...,1.880254,-0.511965,-1.12302,-1.979409,-0.227619,0.235496,2013,-2.149229,-0.070276,-0.526399
36637,Anatomy of a Car Crash,Becomer,-1.315375,single,-0.013342,-0.936288,1.056933,-1.100831,0,0.7514,...,-0.568391,-0.391844,-0.365916,-1.756122,0.451901,0.235496,2018,-1.753701,-0.310865,-1.197261
8349,Wool Gloves,Various Artists,-1.315375,compilation,0.186431,-1.593189,1.619491,-0.466352,1,-0.258187,...,2.073489,-0.517685,-1.328797,-1.306978,-0.665272,0.235496,2019,-2.149229,-0.070276,-0.301508
1133,Mornings,The Boy from the South,-1.315375,single,0.506067,-1.141086,1.619491,-0.305826,0,-0.36503,...,-0.517365,-0.271723,-0.183435,-0.513208,-0.730478,0.235496,2020,-0.874749,-0.310703,-0.281552
37111,solitude reminisce,Sptmbr Yngstr.,-0.543854,single,0.186431,-1.30338,1.338212,-0.987487,0,0.116537,...,0.557804,-0.409004,-0.804648,-1.337066,-0.131158,0.235496,2022,-1.446068,-0.310747,-0.15998
34461,High Horse,Chasen Wayne,-1.229651,single,0.323417,-1.770939,1.619491,-0.847182,0,-0.466453,...,-0.521592,-0.494805,-0.862887,-0.828932,-0.584979,0.235496,2021,-2.061334,-0.310865,-1.045673
37177,Come to Stay,Oribu,-1.315375,album,0.814287,-1.272467,1.056933,-0.472206,0,-0.172248,...,0.898984,-0.612066,-1.173493,-1.239824,-0.287038,0.235496,2020,-0.962644,-0.31051,-0.879883
49766,Living in a Transient Dream,Various Artists,-1.315375,compilation,0.363372,-1.268603,1.338212,-1.433946,1,-0.445549,...,1.185817,-0.431884,-1.635521,-0.910834,-0.281166,0.235496,2018,-2.149229,-0.070276,-0.597519
26358,After Youve Gone,Tuba Fats,-1.058201,album,0.42045,-1.724569,1.338212,-1.235993,1,-0.39987,...,1.04693,-0.374684,0.437778,-0.398618,0.351707,0.235496,2006,-2.017386,-0.31082,0.05807


### Get relevant songs
- A relevant song is one that has is close in rating to the target song  
i.e. target_rating - a <= `rating` <= target_rating + a  

In [13]:
# gets all relevant song indexes for 1 song
def get_relevant(df,index, a = 0.01):
    assert a > 0
    song_score = df.iloc[index]['rating']
    relevant = df[df['rating'].between(song_score - a, song_score + a)].index
    return relevant
    

## Evaluation Metrics
- Precision, Recall, MAP, F1

In [14]:
# gets precision for 1 song

def precision_for_song(df,index):
    
    relevant_items = get_relevant(df,index,0.1)
    recommended_items = recommend_songs_index(index, top_k = len(relevant_items), threshold= 0.1)
    # Calculate the intersection of recommended_items and relevant_items
    true_positive = len(set(recommended_items).intersection(set(relevant_items)))
    
    # Calculate the total number of recommended items
    total_recommended_items = len(recommended_items)

    # Calculate precision
    precision_value = true_positive / total_recommended_items if total_recommended_items > 0 else 0
    return precision_value

# get model precision
def model_precision(df):
    total = 0
    df_dict = df.to_dict('records')
    for row in range(len(df_dict)):
        total += precision_for_song(df,row)
    return total/len(df)
    
# get recall for 1 song
def recall_for_song(df,index):
    relevant_items = get_relevant(df,index,0.1)
    recommended_items = recommend_songs_index(index, top_k = len(relevant_items),threshold=0.1)
    # Calculate the intersection of recommended_items and relevant_items
    true_positive = len(set(recommended_items).intersection(set(relevant_items)))

    # Calculate the total number of relevant items
    total_relevant_items = len(relevant_items)

    # Calculate recall
    recall_value = true_positive / total_relevant_items if total_relevant_items > 0 else 0
    return recall_value
# get model recall
def model_recall(df):
    total = 0
    df_dict = df.to_dict('records')
    for row in range(len(df_dict)):
        total += recall_for_song(df,row)
    return total/len(df)
    

def model_f1_score(precision,recall):
    return (2*precision*recall)/(precision+recall)





In [30]:
precision = model_precision(df)
print(f'Model Precision: {precision}')
recall = model_recall(df)
print(f'Model Recall: {recall}')



Model Precision: 0.01417993408951809
Model Recall: 0.01417993408951809


In [None]:
f1 = model_f1_score(precision, recall)
print(f'Model F1: {f1}')

## Content Based Filtering with Tiktok Score and Lyrics
- Create word soup
- Use `tiktok_score`

In [18]:
# truncate middle
def trunc(lyrics):
  lyrics_front = str(lyrics)[:128]
  lyrics_back = str(lyrics)[-384:]
  return lyrics_front + lyrics_back

In [19]:
f_df = pd.read_json('spot_final.json')
f_df = f_df.reset_index(drop=True)
f_df['lyrics'] = f_df['lyrics'].apply(lambda x : x.translate(str.maketrans("", "", string.punctuation)))
f_df['soup'] = f_df['lyrics'].apply(lambda x : wnl.lemmatize(trunc(x), pos="v")) + f_df['name'] + ' by ' + f_df['artists_names']
f_df['soup'] = f_df['soup'].apply(str)

numerical_features = ['acousticness', 'danceability', 'duration_ms', 
                    'energy', 'instrumentalness', 'loudness', 'liveness', 
                    'speechiness', 'time_signature', 'key', 'valence', 
                    'tempo','popularity', 'artists_mean_popularities',
                    'artists_mean_followers']

# scaling numerical features for similarity matrix
std_scaler = StandardScaler()
f_df[numerical_features] = std_scaler.fit_transform(f_df[numerical_features]).astype(np.float32)
# drop
f_df = f_df.drop(['track_uri', 'artists_genres','n_playlist','lyrics','playlist_uris'],axis= 1)
f_df.head(3)

Unnamed: 0,name,artists_names,popularity,album_type,danceability,energy,key,loudness,mode,speechiness,...,liveness,valence,tempo,duration_ms,time_signature,release_year,artists_mean_popularities,artists_mean_followers,tt_score,soup
0,The Safety Dance,Men Without Hats,0.270529,album,0.055151,0.899171,-0.068183,0.258057,1,-0.450969,...,-0.214522,0.596964,-0.668881,-0.65485,0.235496,1982,0.267889,-0.284085,5.418839,We can dance if we want to We can leave your ...
1,Endless Summer,Grizfolk,0.613427,single,-0.367225,0.474117,-0.349462,0.570241,1,-0.496648,...,2.336627,-0.125196,-1.338745,-0.175897,0.235496,2018,0.136046,-0.292482,4.115897,Take it back its over time Ooh you know its o...
2,Castaway,Zac Brown Band,-1.315375,album,0.277755,0.686644,0.775654,0.770145,1,-0.368127,...,-0.489085,1.715148,-0.636752,-0.433272,0.235496,2015,1.058946,0.138477,4.085013,Castaway Ride the waves like we’re young Caus...


In [20]:
from sklearn.compose import ColumnTransformer

numerical_features.append('tt_score')

num_feat = f_df[numerical_features].astype(pd.SparseDtype("float32",0)).sparse.to_coo().tocsr()
numerical_features.append('soup')
tfidf = TfidfVectorizer(stop_words= 'english', max_features=100000,dtype = np.float32)
soup_tfidf = tfidf.fit_transform(f_df['soup'])
# print(soup_tfidf.shape,type(soup_tfidf))
soup_tfidf = soup_tfidf.astype('float32')
# ct = ColumnTransformer([('soup_tfidf', tfidf,'soup')], remainder='passthrough')

# final_matrix = ct.fit_transform(f_df[numerical_features])
final_matrix  = sparse.hstack([num_feat,soup_tfidf])

In [22]:
sim_matrix = None
sim_matrix = cosine_similarity_n_space(final_matrix,final_matrix)

## Evaluation

In [23]:
f_df['rating'] = f_df['tt_score']
f_precision = model_precision(f_df)
print(f'Final Model Precision: {f_precision}')
f_recall = model_recall(f_df)
print(f'Final Model Recall: {f_recall}')



Final Model Precision: 0.04104516635415255
Final Model Recall: 0.04104516635415255


In [24]:
f_f1 = model_f1_score(f_precision, f_recall)
print(f'Final Model F1: {f_f1}')

Final Model F1: 0.04104516635415255
