In [228]:
import pandas as pd

In [229]:
# Playlist interactions: malformed CSV rows are skipped rather than aborting the load.
user_raw = pd.read_csv(
    '../dataset/spotify_dataset.csv',
    on_bad_lines='skip',
)
# Header cells carry stray double quotes and padding; normalise them once here.
user_raw.columns = user_raw.columns.str.replace('"', '').str.strip()

# Lyrics catalogue; the `link` column is discarded immediately.
song_raw = pd.read_csv('../dataset/spotify_millsongdata.csv').drop(columns='link')

In [230]:
# Preview the lyrics table (the `link` column was dropped at load time).
song_raw

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...
...,...,...,...
57645,Ziggy Marley,Good Old Days,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,northern star \r\nam i frightened \r\nwhere ...


In [231]:
# Preview the playlist table (column names were stripped of quotes/whitespace at load time).
user_raw

Unnamed: 0,user_id,artistname,trackname,playlistname
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010
...,...,...,...,...
12891675,2302bf9c64dc63d88a750215ed187f2c,Mötley Crüe,Wild Side,iPhone
12891676,2302bf9c64dc63d88a750215ed187f2c,John Lennon,Woman,iPhone
12891677,2302bf9c64dc63d88a750215ed187f2c,Tom Petty,You Don't Know How It Feels,iPhone
12891678,2302bf9c64dc63d88a750215ed187f2c,Tom Petty,You Wreck Me,iPhone


In [232]:
# Keep only playlist rows whose (track, artist) pair exists in the lyrics table:
# a left join followed by dropna() discards every row without a match (and any
# row with another missing field). The lyrics columns are only needed for the
# match, so they are dropped again at the end.
merge_df = (
    user_raw
    .merge(
        song_raw,
        how='left',
        left_on=['trackname', 'artistname'],
        right_on=['song', 'artist'],
    )
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
    .drop(columns=['artist', 'song', 'text'])
)
merge_df

Unnamed: 0,user_id,artistname,trackname,playlistname
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Paul McCartney,Band On The Run,HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Paul McCartney,Dance Tonight,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Crowded House,Don't Dream It's Over,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Crowded House,Fall At Your Feet,HARD ROCK 2010
...,...,...,...,...
980823,2302bf9c64dc63d88a750215ed187f2c,Soundgarden,The Day I Tried To Live,iPhone
980824,2302bf9c64dc63d88a750215ed187f2c,Def Leppard,Too Late For Love,iPhone
980825,2302bf9c64dc63d88a750215ed187f2c,Jimi Hendrix,Voodoo Child (Slight Return),iPhone
980826,2302bf9c64dc63d88a750215ed187f2c,Metallica,Welcome Home (Sanitarium),iPhone


# Data preprocessing

In [233]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [234]:
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
# Fetch the NLTK resources requested here: the stop-word corpus and the 'punkt' tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')

# Module-level helpers used by preprocess_text below.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))  # stored as a set for membership tests
def preprocess_text(text):
    """Normalise a string into space-separated, stemmed, stop-word-free tokens.

    Steps, in order: replace CRLF line breaks with spaces, lower-case, delete
    every character that is not a-z or whitespace, tokenize with
    ``word_tokenize``, drop tokens found in ``stop_words`` and stem the rest
    with the module-level ``ps`` stemmer.

    Note that punctuation is stripped *before* the stop-word filter, so a
    contraction such as "don't" becomes "dont" before it is looked up.
    """
    flattened = re.sub(r'\r\n', ' ', text).lower()
    letters_only = re.sub(r'[^a-z\s]', '', flattened)
    stems = [
        ps.stem(token)
        for token in word_tokenize(letters_only)
        if token not in stop_words
    ]
    return ' '.join(stems)

def preprocess_combined(row):
    """Return one cleaned document for a song row: artist, title and lyrics.

    Each of ``row['artist']``, ``row['song']`` and ``row['text']`` is passed
    through ``preprocess_text`` and the three results are joined with single
    spaces, in that order.
    """
    cleaned_parts = [preprocess_text(row[field]) for field in ('artist', 'song', 'text')]
    return ' '.join(cleaned_parts)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\11580\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\11580\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [235]:
# Build the cleaned "artist + title + lyrics" document for every song (row-wise apply).
song_raw['text1'] = song_raw.apply(preprocess_combined, axis=1)

In [236]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Split every (user, playlist) group 80/20 into train and test rows.
# The pieces are collected in lists and concatenated once at the end: calling
# pd.concat inside the loop re-copies the whole accumulated frame on every
# iteration, which made this cell quadratic in the number of groups.
train_parts = []
test_parts = []

user_groups = merge_df.groupby(['user_id','playlistname'])
total_groups = len(user_groups)

for user_id, user_data in tqdm(user_groups, total=total_groups, desc="Processing users"):
    # A single-song playlist cannot be split, so it is left out of both sets.
    if len(user_data) > 1:
        train_data, test_data = train_test_split(user_data, test_size=0.2, random_state=42)
        train_parts.append(train_data)
        test_parts.append(test_data)

# pd.concat raises on an empty list; fall back to an empty frame as before.
train_df = pd.concat(train_parts) if train_parts else pd.DataFrame()
test_df = pd.concat(test_parts) if test_parts else pd.DataFrame()

Processing users: 100%|██████████| 13730/13730 [05:41<00:00, 40.23it/s]


In [237]:
# Re-attach the lyrics columns (including the cleaned `text1`) to the training
# rows, then drop unmatched rows and exact duplicates and renumber the index.
train_df = (
    train_df
    .merge(
        song_raw,
        how='left',
        left_on=['trackname', 'artistname'],
        right_on=['song', 'artist'],
    )
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)
train_df

Unnamed: 0,user_id,artistname,trackname,playlistname,artist,song,text,text1
0,00055176fea33f6e027cd3302289378b,Little Mix,Little Me,favs,Little Mix,Little Me,She lives in the shadow of a lonely girl \r\n...,littl mix littl live shadow lone girl voic qui...
1,00055176fea33f6e027cd3302289378b,Demi Lovato,Made In The USA,favs,Demi Lovato,Made In The USA,Our love runs deep like a chevy \r\nIf you fa...,demi lovato made usa love run deep like chevi ...
2,00055176fea33f6e027cd3302289378b,Imagine Dragons,I Bet My Life,favs,Imagine Dragons,I Bet My Life,[Verse] \r\nI knew I took the path \r\nThat ...,imagin dragon bet life vers knew took path wou...
3,00055176fea33f6e027cd3302289378b,Bruno Mars,Treasure,favs,Bruno Mars,Treasure,"Give me all, give me all, give me all your att...",bruno mar treasur give give give attent babi g...
4,00055176fea33f6e027cd3302289378b,One Direction,Act My Age,favs,One Direction,Act My Age,[Verse 1: Niall] \r\nWhen I'm fat and old \r...,one direct act age vers niall im fat old kid t...
...,...,...,...,...,...,...,...,...
778867,fff77dadf8528083c920b9c018847e8b,Fall Out Boy,I've Got All This Ringing In My Ears And None ...,Liked from Radio,Fall Out Boy,I've Got All This Ringing In My Ears And None ...,You're a canary \r\nI'm a coal mine \r\n'Cau...,fall boy ive got ring ear none finger your can...
778868,fff77dadf8528083c920b9c018847e8b,Lana Del Rey,Lolita,Liked from Radio,Lana Del Rey,Lolita,Would you be mine? \r\nWould you be my baby t...,lana del rey lolita would mine would babi toni...
778869,fff77dadf8528083c920b9c018847e8b,Lana Del Rey,Cola,Liked from Radio,Lana Del Rey,Cola,My pussy tastes like Pepsi Cola \r\nMy eyes a...,lana del rey cola pussi tast like pepsi cola e...
778870,fff77dadf8528083c920b9c018847e8b,Mariah Carey,Hero,Liked from Radio,Mariah Carey,Hero,There's a hero \r\nIf you look inside your he...,mariah carey hero there hero look insid heart ...


In [238]:
# Re-attach the lyrics columns (including the cleaned `text1`) to the held-out
# rows, then drop unmatched rows and exact duplicates and renumber the index.
test_df = (
    test_df
    .merge(
        song_raw,
        how='left',
        left_on=['trackname', 'artistname'],
        right_on=['song', 'artist'],
    )
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)
test_df

Unnamed: 0,user_id,artistname,trackname,playlistname,artist,song,text,text1
0,00055176fea33f6e027cd3302289378b,Ed Sheeran,Friends,favs,Ed Sheeran,Friends,"We're not, no we're not friends, nor have we e...",ed sheeran friend friend ever tri keep secret ...
1,00055176fea33f6e027cd3302289378b,OneRepublic,I Lived,favs,OneRepublic,I Lived,Hoping you take that jump \r\nBut don't fear ...,onerepubl live hope take jump dont fear fall h...
2,00055176fea33f6e027cd3302289378b,Avril Lavigne,Girlfriend,favs,Avril Lavigne,Girlfriend,"[Chorus] \r\nHey, hey \r\nYou, you \r\nI do...",avril lavign girlfriend choru hey hey dont lik...
3,00055176fea33f6e027cd3302289378b,Little Mix,They Just Don't Know You,favs,Little Mix,They Just Don't Know You,(JADE) \r\nDaddy doesn't think \r\nThat you'...,littl mix dont know jade daddi doesnt think yo...
4,00055176fea33f6e027cd3302289378b,Green Day,21 Guns,favs,Green Day,21 Guns,"Do you know what's worth fighting for, \r\nWh...",green day gun know what worth fight worth die ...
...,...,...,...,...,...,...,...,...
201240,fff77dadf8528083c920b9c018847e8b,Travis,Sing,Liked from Radio,Travis,Sing,"Baby, you've been going so crazy \r\nLately, ...",travi sing babi youv go crazi late noth seem g...
201241,fff77dadf8528083c920b9c018847e8b,Fall Out Boy,"What A Catch, Donnie",Liked from Radio,Fall Out Boy,"What A Catch, Donnie",I got troubled thoughts \r\nAnd the self-este...,fall boy catch donni got troubl thought selfes...
201242,fff77dadf8528083c920b9c018847e8b,Justin Timberlake,My Love,Liked from Radio,Justin Timberlake,My Love,If I wrote you a symphony \r\nJust to say how...,justin timberlak love wrote symphoni say much ...
201243,fff77dadf8528083c920b9c018847e8b,Britney Spears,Circus,Liked from Radio,Britney Spears,Circus,There's only two types of people in the world ...,britney spear circu there two type peopl world...


In [239]:
# Fit the shared TF-IDF vocabulary (capped at 10,000 terms) on every song document.
# The transformed matrix is only displayed here; later cells call tfidf.transform again.
tfidf = TfidfVectorizer(max_features=10000)
tfidf.fit_transform(song_raw['text1'])

<57650x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 3534566 stored elements in Compressed Sparse Row format>

In [240]:
# One profile document per (user, playlist): all cleaned training-song texts
# joined with spaces.
grouped_df = (
    train_df
    .groupby(['user_id','playlistname'])
    .agg({'text1': ' '.join})
    .reset_index()
)
# Keep the row position as a column so it survives the merge into train rows
# and can be used to look up the matching profile vector.
grouped_df['index'] = grouped_df.index
grouped_df

Unnamed: 0,user_id,text1,index
0,00055176fea33f6e027cd3302289378b,littl mix littl live shadow lone girl voic qui...,0
1,0007f3dd09c91198371454c608d47f22,evanesc heart broken wander til end time torn ...,1
2,000b0f32b5739f052b9d40fcc5c41079,mariah carey one child angel appear highest on...,2
3,000c11a16c89aa4b14b328080f5954ee,pharrel william gush gush gush gush tell gush ...,3
4,00123e0f544dee3ab006aa7f1e5725a7,talk head slipperi peopl time rollin fall face...,4
...,...,...,...
13014,ffe11226cdea81a2db9262c0ec7f5d71,nickelback savin prison gate wont open hand kn...,13014
13015,ffe32d5412269f3041c58cbf0dde3306,bill wither love day wake morn love sunlight h...,13015
13016,fff60baf392613ed33f745b89a9b38f7,linkin park war one two one two three four the...,13016
13017,fff616055993498d6127f3f467cf9f2b,elli gould wish stay cant speak anoth languag ...,13017


In [241]:
# Attach each training row's playlist profile; both frames have a `text1`
# column, so the merge yields `text1_x` (song) and `text1_y` (profile).
merged_train = train_df.merge(
    grouped_df,
    how='left',
    left_on=['user_id','playlistname'],
    right_on=['user_id','playlistname'],
)
# Profile vectors, one row per grouped_df row (addressed later via the `index` column).
tfidf_text1_y = tfidf.transform(grouped_df['text1'])

In [None]:

def similarity(row):
    """Cosine similarity between one training song and its playlist profile.

    ``row['text1_x']`` is vectorised with the fitted ``tfidf`` and compared with
    the pre-computed profile vector stored at position ``int(row['index'])`` of
    ``tfidf_text1_y``. Returns the single similarity value.
    """
    profile_vector = tfidf_text1_y[int(row['index'])]
    song_vector = tfidf.transform([row['text1_x']])
    return cosine_similarity(song_vector, profile_vector).flatten()[0]
# Score every training row against its own playlist profile (row-wise apply, one TF-IDF transform per row).
merged_train['similarity'] = merged_train.apply(lambda row: similarity(row), axis=1)

In [None]:
# Per-user mean and standard deviation of the training similarities.
per_user_similarity = merged_train.groupby('user_id')['similarity']
similarity_threshold = per_user_similarity.mean().reset_index()
similarity_sd = per_user_similarity.std().reset_index()

# Both stats frames expose a `similarity` column, so the second merge suffixes
# them `_x` / `_y`; those are renamed to `mean` / `sd`.
grouped_df = (
    grouped_df
    .merge(similarity_threshold, how='left', left_on=['user_id'], right_on=['user_id'])
    .merge(similarity_sd, how='left', left_on=['user_id'], right_on=['user_id'])
    .rename(columns={'similarity_x': 'mean','similarity_y': 'sd'})
)
# Persist the profiles so later sessions can skip the similarity computation.
grouped_df.to_csv('../dataset/grouped_df.csv', index=False)

In [None]:
# Reload the profile table from the CSV file that the previous cell writes.
grouped_df = pd.read_csv('../dataset/grouped_df.csv')

In [None]:
import matplotlib.pyplot as plt


# Histogram of training similarities for the user who owns row 10 of merged_train.
first_user_id = merged_train['user_id'].iloc[10]
first_user_data = merged_train.loc[merged_train['user_id'] == first_user_id]
similarity_scores = first_user_data['similarity']
mean_similarity = similarity_scores.mean()

plt.figure(figsize=(10, 6))
plt.hist(similarity_scores, bins=20, edgecolor='k', alpha=0.7)
# Dashed red line marks this user's mean similarity.
plt.axvline(
    mean_similarity,
    color='r',
    linestyle='dashed',
    linewidth=2,
    label=f'Mean: {mean_similarity:.4f}',
)
plt.xlabel('Similarity Score')
plt.ylabel('Frequency')
plt.title(f'Similarity Distribution for User {first_user_id}')
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt


# Histogram of training similarities for the user who owns row 100 of merged_train.
first_user_id = merged_train['user_id'].iloc[100]
first_user_data = merged_train.loc[merged_train['user_id'] == first_user_id]
similarity_scores = first_user_data['similarity']
mean_similarity = similarity_scores.mean()

plt.figure(figsize=(10, 6))
plt.hist(similarity_scores, bins=20, edgecolor='k', alpha=0.7)
# Dashed red line marks this user's mean similarity.
plt.axvline(
    mean_similarity,
    color='r',
    linestyle='dashed',
    linewidth=2,
    label=f'Mean: {mean_similarity:.4f}',
)
plt.xlabel('Similarity Score')
plt.ylabel('Frequency')
plt.title(f'Similarity Distribution for User {first_user_id}')
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt


# Histogram of training similarities for the user who owns row 1000 of merged_train.
first_user_id = merged_train['user_id'].iloc[1000]
first_user_data = merged_train.loc[merged_train['user_id'] == first_user_id]
similarity_scores = first_user_data['similarity']
mean_similarity = similarity_scores.mean()

plt.figure(figsize=(10, 6))
plt.hist(similarity_scores, bins=20, edgecolor='k', alpha=0.7)
# Dashed red line marks this user's mean similarity.
plt.axvline(
    mean_similarity,
    color='r',
    linestyle='dashed',
    linewidth=2,
    label=f'Mean: {mean_similarity:.4f}',
)
plt.xlabel('Similarity Score')
plt.ylabel('Frequency')
plt.title(f'Similarity Distribution for User {first_user_id}')
plt.legend()
plt.show()

In [None]:
# Histogram of song-to-profile similarity over every training row.
similarity_scores = merged_train['similarity']
mean_similarity = similarity_scores.mean()

plt.figure(figsize=(10, 6))
plt.hist(similarity_scores, bins=20, edgecolor='k', alpha=0.7)
# Dashed red line marks the overall mean similarity.
plt.axvline(
    mean_similarity,
    color='r',
    linestyle='dashed',
    linewidth=2,
    label=f'Mean: {mean_similarity:.4f}',
)
plt.xlabel('Similarity Score')
plt.ylabel('Frequency')
plt.title('Similarity Distribution for All Users')
plt.legend()
plt.show()

# Top-N Recommendation Method

In [None]:
import numpy as np
# Draw 200 evaluation users (fixed seed). grouped_df is built per
# (user_id, playlistname), so a user_id can occupy several rows; sampling the
# raw column could pick the same user twice and over-weight users with many
# playlists. De-duplicate first; when ids are already unique the draw is unchanged.
test_user_ids = grouped_df['user_id'].drop_duplicates().sample(n=200, random_state=42, replace=False)
test_user_ids

In [None]:
# TF-IDF matrix for the whole song catalogue, one row per song_raw row in the same order.
all_songs = tfidf.transform(song_raw['text1'])

In [None]:
# Content-based Top-N: rank every song the user has not collected by cosine
# similarity to the user's profile and count how many held-out songs appear
# in the first N.
top_N = [100,50,30,20]
correct = [0,0,0,0]
total_rcmd = [0,0,0,0]
total_collect = [0,0,0,0]
for id in tqdm(test_user_ids, desc="Processing users"):
    user_collect = train_df[train_df['user_id'] == id]
    user_test = test_df[test_df['user_id'] == id]

    # Compute the "not collected yet" mask once and reuse it for both the
    # candidate frame and the row positions into all_songs.
    notcollect_mask = ~song_raw['text'].isin(user_collect['text'])
    user_notcollect = song_raw[notcollect_mask].copy()
    notcollect_indices = np.where(notcollect_mask)[0]

    # The profile is the first grouped_df row for this user.
    profile = tfidf.transform([grouped_df[grouped_df['user_id'] == id]['text1'].values[0]])
    songs = all_songs[notcollect_indices]
    similarity_score = cosine_similarity(songs, profile).flatten()
    user_notcollect['similarity'] = similarity_score

    # Vectorised hit flag (1 when the song's lyrics occur in the held-out set);
    # replaces a row-wise apply that scanned the test lyrics for every candidate.
    user_notcollect['actual'] = user_notcollect['text'].isin(user_test['text']).astype(int)
    ranked_songs = user_notcollect.sort_values(by='similarity', ascending=False)

    for n in range(len(top_N)):
        # Slice from the full ranking each time so the result does not depend
        # on top_N being listed in descending order.
        recommended_songs = ranked_songs.head(top_N[n])
        correct[n] += recommended_songs['actual'].sum()
        total_rcmd[n] += top_N[n]
        total_collect[n] += len(user_test)

In [None]:
def result_output(top_N,correct,total_rcmd,total_collect):
    """Print and plot precision, recall and F1 for each Top-N cutoff.

    Parameters
    ----------
    top_N : sequence of int
        The cutoffs that were evaluated; used for the printed headers and as
        the x-axis of the plot. (Previously this argument was overwritten by a
        hard-coded list, so any other cutoffs were mislabelled or raised.)
    correct : sequence of numbers
        Per-cutoff count of recommended songs that were in the held-out set.
    total_rcmd : sequence of numbers
        Per-cutoff count of recommendations made.
    total_collect : sequence of numbers
        Per-cutoff count of held-out songs.

    Raises
    ------
    ValueError
        If the four sequences do not have the same length.

    Returns
    -------
    None
    """
    if not (len(top_N) == len(correct) == len(total_rcmd) == len(total_collect)):
        raise ValueError("top_N, correct, total_rcmd and total_collect must have the same length")

    precision_list = []
    recall_list = []
    f1_score_list = []
    for n in range(len(top_N)):
        # Guard every ratio against a zero denominator (e.g. no test users).
        precision = correct[n] / total_rcmd[n] if total_rcmd[n] > 0 else 0
        recall = correct[n] / total_collect[n] if total_collect[n] > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        precision_list.append(precision)
        recall_list.append(recall)
        f1_score_list.append(f1_score)

        print(f"\nTop N = {top_N[n]}")
        print(f"Precision (how many songs liked by user in all recommended): {precision}")
        print(f"Recall (how many songs collected by user been recommended): {recall}")
        print(f"F1-Score: {f1_score}")
    plt.figure(figsize=(10, 6))

    plt.plot(top_N, precision_list, marker='o', label='Precision')
    plt.plot(top_N, recall_list, marker='s', label='Recall')
    plt.plot(top_N, f1_score_list, marker='^', label='F1-Score')

    plt.xlabel('Top N')
    plt.ylabel('Score')
    plt.title('Precision, Recall, and F1-Score for different Top N')
    plt.legend()
    plt.grid(True)
    plt.show()


In [None]:
# Print and plot precision / recall / F1 from the counters filled by the content-based Top-N loop.
result_output(top_N,correct,total_rcmd,total_collect)

# Recommend based on similarity distribution

## Top-N closest to global mean similarity

In [None]:
# Mean song-to-profile similarity over every training row; used below as the global target value.
similarity_scores = merged_train['similarity']
global_mean_similarity = similarity_scores.mean()
global_mean_similarity

In [None]:
# Top-N by closeness to the global mean similarity: candidates are ranked by
# |similarity - global_mean_similarity| (smallest first) instead of by raw
# similarity.
top_N = [100,50,30,20]
correct = [0,0,0,0]
total_rcmd = [0,0,0,0]
total_collect = [0,0,0,0]
for id in tqdm(test_user_ids, desc="Processing users"):
    user_collect = train_df[train_df['user_id'] == id]
    user_test = test_df[test_df['user_id'] == id]

    # Compute the "not collected yet" mask once and reuse it for both the
    # candidate frame and the row positions into all_songs.
    notcollect_mask = ~song_raw['text'].isin(user_collect['text'])
    user_notcollect = song_raw[notcollect_mask].copy()
    notcollect_indices = np.where(notcollect_mask)[0]

    # The profile is the first grouped_df row for this user.
    profile = tfidf.transform([grouped_df[grouped_df['user_id'] == id]['text1'].values[0]])
    songs = all_songs[notcollect_indices]
    similarity_score = cosine_similarity(songs, profile).flatten()
    user_notcollect['similarity'] = similarity_score
    user_notcollect['dist_to_mean'] = abs(user_notcollect['similarity']-global_mean_similarity)

    # Vectorised hit flag (1 when the song's lyrics occur in the held-out set);
    # replaces a row-wise apply that scanned the test lyrics for every candidate.
    user_notcollect['actual'] = user_notcollect['text'].isin(user_test['text']).astype(int)
    ranked_songs = user_notcollect.sort_values(by='dist_to_mean', ascending=True)

    for n in range(len(top_N)):
        # Slice from the full ranking each time so the result does not depend
        # on top_N being listed in descending order.
        recommended_songs = ranked_songs.head(top_N[n])
        correct[n] += recommended_songs['actual'].sum()
        total_rcmd[n] += top_N[n]
        total_collect[n] += len(user_test)

In [None]:
# Print and plot precision / recall / F1 from the counters filled by the global-mean loop.
result_output(top_N,correct,total_rcmd,total_collect)

## Top-N closest to local (personal) mean similarity

In [None]:
# Top-N by closeness to the user's own mean similarity: candidates are ranked
# by |similarity - mean| where `mean` is read from the user's grouped_df row.
top_N = [100,50,30,20]
correct = [0,0,0,0]
total_rcmd = [0,0,0,0]
total_collect = [0,0,0,0]
for id in tqdm(test_user_ids, desc="Processing users"):
    user_collect = train_df[train_df['user_id'] == id]
    user_test = test_df[test_df['user_id'] == id]

    # Compute the "not collected yet" mask once and reuse it for both the
    # candidate frame and the row positions into all_songs.
    notcollect_mask = ~song_raw['text'].isin(user_collect['text'])
    user_notcollect = song_raw[notcollect_mask].copy()
    notcollect_indices = np.where(notcollect_mask)[0]

    # Profile text and personal mean both come from the first grouped_df row
    # for this user.
    profile = tfidf.transform([grouped_df[grouped_df['user_id'] == id]['text1'].values[0]])
    songs = all_songs[notcollect_indices]
    similarity_score = cosine_similarity(songs, profile).flatten()
    mean_similarity = grouped_df[grouped_df['user_id'] == id]['mean'].values[0]
    user_notcollect['similarity'] = similarity_score
    user_notcollect['dist_to_mean'] = abs(user_notcollect['similarity']-mean_similarity)

    # Vectorised hit flag (1 when the song's lyrics occur in the held-out set);
    # replaces a row-wise apply that scanned the test lyrics for every candidate.
    user_notcollect['actual'] = user_notcollect['text'].isin(user_test['text']).astype(int)
    ranked_songs = user_notcollect.sort_values(by='dist_to_mean', ascending=True)

    for n in range(len(top_N)):
        # Slice from the full ranking each time so the result does not depend
        # on top_N being listed in descending order.
        recommended_songs = ranked_songs.head(top_N[n])
        correct[n] += recommended_songs['actual'].sum()
        total_rcmd[n] += top_N[n]
        total_collect[n] += len(user_test)


In [None]:
# Print and plot precision / recall / F1 from the counters filled by the personal-mean loop.
result_output(top_N,correct,total_rcmd,total_collect)

# User-based recommendation

In [None]:

# Cutoffs and per-cutoff counters for the user-based run. They live in their own cell,
# so re-running only the loop cell would keep adding to the previous totals.
top_N = [100,50,30,20]
correct = [0,0,0,0]
total_rcmd = [0,0,0,0]
total_collect = [0,0,0,0]

In [None]:
# User-based recommendation: walk through the other users from most to least
# similar profile, pooling songs they collected that the target user has not,
# and evaluate the pooled list at each Top-N cutoff as soon as it is long enough.
all_profiles = tfidf.transform(grouped_df['text1'])
for id in tqdm(test_user_ids, desc="Processing users"):
    # A single boolean mask selects "other" rows in both grouped_df and
    # all_profiles, so the two always stay the same length even when the user
    # owns several profile rows. Positions (not index labels) address the matrix.
    own_rows = (grouped_df['user_id'] == id).to_numpy()
    profile_index = np.flatnonzero(own_rows)[0]
    profile = all_profiles[profile_index]
    other_users = grouped_df[~own_rows].copy()
    other_profiles = all_profiles[np.flatnonzero(~own_rows)]
    similarity_score = cosine_similarity(other_profiles, profile).flatten()
    other_users['similarity'] = similarity_score
    other_users = other_users.sort_values(by='similarity', ascending=False)

    recommended_songs = pd.DataFrame()
    user_collect = train_df[train_df['user_id'] == id]
    user_test = test_df[test_df['user_id'] == id]
    added = [False] * len(top_N)
    for oid in other_users['user_id'].unique():
        other_collect = train_df[train_df['user_id'] == oid]
        user_notcollect = other_collect[~other_collect['text'].isin(user_collect['text'])]
        # A neighbour may hold the same song in several playlists; recommend it once.
        user_notcollect = user_notcollect.drop_duplicates(subset='text')
        if len(recommended_songs) > 0:
            user_notcollect = user_notcollect[~user_notcollect['text'].isin(recommended_songs['text'])]
        recommended_songs = pd.concat([recommended_songs, user_notcollect])

        for n in range(len(top_N)):
            if len(recommended_songs) >= top_N[n] and not added[n]:
                # Evaluate a slice and leave the pool intact: truncating the
                # pool to a small cutoff used to discard candidates that the
                # larger cutoffs still needed.
                top_songs = recommended_songs.head(top_N[n])
                correct[n] += top_songs['text'].isin(user_test['text']).sum()
                total_rcmd[n] += top_N[n]
                total_collect[n] += len(user_test)
                added[n] = True
        # Stop once every cutoff has been evaluated for this user. A cutoff the
        # pool never reaches is left uncounted for this user, as before.
        if all(added):
            break


In [None]:
# Print and plot precision / recall / F1 from the counters filled by the user-based loop.
result_output(top_N,correct,total_rcmd,total_collect)

## Matrix decomposition

In [None]:
# Mark every training row with label 1 (this column is the rating passed to Surprise below).
train_df['label'] = 1

In [None]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD, accuracy
from sklearn.metrics import f1_score, precision_score, recall_score
import random

# Integer ids for Surprise: position of each value in its unique() array.
user_ids = train_df['user_id'].unique()
song_ids = song_raw['song'].unique()

user_id_mapping = dict(zip(user_ids, range(len(user_ids))))
# NOTE(review): songs are keyed by title only, so same-titled songs by
# different artists share one index.
song_id_mapping = dict(zip(song_ids, range(len(song_ids))))

train_df['song_id_idx'] = train_df['trackname'].map(song_id_mapping)
train_df['user_id_idx'] = train_df['user_id'].map(user_id_mapping)

# negative_samples = []
# for user_id in tqdm(user_ids, desc="Processing users"):
#     user_idx = user_id_mapping[user_id]
#     positive_songs = train_df[train_df['user_id'] == user_id]['trackname'].values
#     positive_song_indices = [song_id_mapping[song] for song in positive_songs]
#     all_song_indices = list(set(range(len(song_ids))))
#     negative_song_indices = list(set(all_song_indices) - set(positive_song_indices))
#     negative_sample_indices = random.sample(negative_song_indices, len(positive_song_indices))
    
#     for song_idx in negative_sample_indices:
#         negative_samples.append([user_idx, song_idx, 0])

# negative_df = pd.DataFrame(negative_samples, columns=['user_id_idx', 'song_id_idx', 'label'])
# combined_df = pd.concat([train_df[['user_id_idx', 'song_id_idx', 'label']], negative_df])


In [None]:
# Load the (user, song, label) triples into Surprise with a 0-1 rating scale.
# NOTE(review): `label` is assigned 1 for every training row and the negative-sampling
# code in this notebook is commented out, so the model appears to see only positive
# ratings - confirm this is intended.
reader = Reader(rating_scale=(0, 1))
train_data = Dataset.load_from_df(train_df[['user_id_idx', 'song_id_idx', 'label']], reader)

# Use every row for training; evaluation is done manually in the loops below.
trainset = train_data.build_full_trainset()

# SVD with the library's default settings.
algo = SVD()
algo.fit(trainset)

In [None]:
# Score every not-yet-collected song with the fitted `algo`, rank by predicted
# rating and count how many held-out songs fall in the first N.
top_N = [100,50,30,20]
correct = [0,0,0,0]
total_rcmd = [0,0,0,0]
total_collect = [0,0,0,0]
for id in tqdm(test_user_ids, desc="Processing users"):
    user_test = test_df[test_df['user_id'] == id]
    user_collect = train_df[train_df['user_id'] == id]
    user_notcollect = song_raw[~song_raw['text'].isin(user_collect['text'])].copy()
    # Vectorised ground-truth label (1 when the lyrics occur in the held-out
    # set); replaces a row-wise apply over the whole catalogue.
    user_notcollect['label'] = user_notcollect['text'].isin(user_test['text']).astype(int)
    user_notcollect['user_id'] = id
    user_notcollect['user_id_idx'] = user_notcollect['user_id'].map(user_id_mapping)
    user_notcollect['song_id_idx'] = user_notcollect['song'].map(song_id_mapping)

    # Surprise expects (user, item, true_rating) triples.
    testset = list(zip(user_notcollect['user_id_idx'], user_notcollect['song_id_idx'], user_notcollect['label']))
    predictions = algo.test(testset)
    scores = [est for (_, _, _, est, _) in predictions]
    user_notcollect['score'] = scores
    ranked_songs = user_notcollect.sort_values(by='score', ascending=False)
    for n in range(len(top_N)):
        # Slice from the full ranking each time so the result does not depend
        # on top_N being listed in descending order.
        recommended_songs = ranked_songs.head(top_N[n])
        correct[n] += recommended_songs['label'].sum()
        total_rcmd[n] += top_N[n]
        total_collect[n] += len(user_test)

In [None]:
# Print and plot precision / recall / F1 from the counters filled by the SVD loop.
result_output(top_N,correct,total_rcmd,total_collect)

In [None]:
from surprise import KNNBasic
# Replace `algo` with a default-configured KNNBasic and fit it on the same trainset.
algo = KNNBasic()
algo.fit(trainset)

In [None]:
# Score every not-yet-collected song with the fitted `algo`, rank by predicted
# rating and count how many held-out songs fall in the first N.
top_N = [100,50,30,20]
correct = [0,0,0,0]
total_rcmd = [0,0,0,0]
total_collect = [0,0,0,0]
for id in tqdm(test_user_ids, desc="Processing users"):
    user_test = test_df[test_df['user_id'] == id]
    user_collect = train_df[train_df['user_id'] == id]
    user_notcollect = song_raw[~song_raw['text'].isin(user_collect['text'])].copy()
    # Vectorised ground-truth label (1 when the lyrics occur in the held-out
    # set); replaces a row-wise apply over the whole catalogue.
    user_notcollect['label'] = user_notcollect['text'].isin(user_test['text']).astype(int)
    user_notcollect['user_id'] = id
    user_notcollect['user_id_idx'] = user_notcollect['user_id'].map(user_id_mapping)
    user_notcollect['song_id_idx'] = user_notcollect['song'].map(song_id_mapping)

    # Surprise expects (user, item, true_rating) triples.
    testset = list(zip(user_notcollect['user_id_idx'], user_notcollect['song_id_idx'], user_notcollect['label']))
    predictions = algo.test(testset)
    scores = [est for (_, _, _, est, _) in predictions]
    user_notcollect['score'] = scores
    ranked_songs = user_notcollect.sort_values(by='score', ascending=False)
    for n in range(len(top_N)):
        # Slice from the full ranking each time so the result does not depend
        # on top_N being listed in descending order.
        recommended_songs = ranked_songs.head(top_N[n])
        correct[n] += recommended_songs['label'].sum()
        total_rcmd[n] += top_N[n]
        total_collect[n] += len(user_test)

In [None]:
# Print and plot precision / recall / F1 from the counters filled by the KNNBasic loop.
result_output(top_N,correct,total_rcmd,total_collect)

In [None]:
from surprise import BaselineOnly
# Baseline-only predictor configured for the "sgd" method with a learning rate of 0.00005.
bsl_options = {
    "method": "sgd",
    "learning_rate": 0.00005,
}
# Replace `algo` and fit it on the same trainset.
algo = BaselineOnly(bsl_options=bsl_options)
algo.fit(trainset)

In [None]:
# Score every not-yet-collected song with the fitted `algo`, rank by predicted
# rating and count how many held-out songs fall in the first N.
top_N = [100,50,30,20]
correct = [0,0,0,0]
total_rcmd = [0,0,0,0]
total_collect = [0,0,0,0]
for id in tqdm(test_user_ids, desc="Processing users"):
    user_test = test_df[test_df['user_id'] == id]
    user_collect = train_df[train_df['user_id'] == id]
    user_notcollect = song_raw[~song_raw['text'].isin(user_collect['text'])].copy()
    # Vectorised ground-truth label (1 when the lyrics occur in the held-out
    # set); replaces a row-wise apply over the whole catalogue.
    user_notcollect['label'] = user_notcollect['text'].isin(user_test['text']).astype(int)
    user_notcollect['user_id'] = id
    user_notcollect['user_id_idx'] = user_notcollect['user_id'].map(user_id_mapping)
    user_notcollect['song_id_idx'] = user_notcollect['song'].map(song_id_mapping)

    # Surprise expects (user, item, true_rating) triples.
    testset = list(zip(user_notcollect['user_id_idx'], user_notcollect['song_id_idx'], user_notcollect['label']))
    predictions = algo.test(testset)
    scores = [est for (_, _, _, est, _) in predictions]
    user_notcollect['score'] = scores
    ranked_songs = user_notcollect.sort_values(by='score', ascending=False)
    for n in range(len(top_N)):
        # Slice from the full ranking each time so the result does not depend
        # on top_N being listed in descending order.
        recommended_songs = ranked_songs.head(top_N[n])
        correct[n] += recommended_songs['label'].sum()
        total_rcmd[n] += top_N[n]
        total_collect[n] += len(user_test)

In [None]:
# Print and plot precision / recall / F1 from the counters filled by the SGD-baseline loop.
result_output(top_N,correct,total_rcmd,total_collect)

In [None]:
# Baseline-only predictor configured for the "als" method: 5 epochs, reg_u=12, reg_i=5.
bsl_options = {"method": "als", "n_epochs": 5, "reg_u": 12, "reg_i": 5}
# Replace `algo` and fit it on the same trainset.
algo = BaselineOnly(bsl_options=bsl_options)
algo.fit(trainset)

In [None]:
# Score every not-yet-collected song with the fitted `algo`, rank by predicted
# rating and count how many held-out songs fall in the first N.
top_N = [100,50,30,20]
correct = [0,0,0,0]
total_rcmd = [0,0,0,0]
total_collect = [0,0,0,0]
for id in tqdm(test_user_ids, desc="Processing users"):
    user_test = test_df[test_df['user_id'] == id]
    user_collect = train_df[train_df['user_id'] == id]
    user_notcollect = song_raw[~song_raw['text'].isin(user_collect['text'])].copy()
    # Vectorised ground-truth label (1 when the lyrics occur in the held-out
    # set); replaces a row-wise apply over the whole catalogue.
    user_notcollect['label'] = user_notcollect['text'].isin(user_test['text']).astype(int)
    user_notcollect['user_id'] = id
    user_notcollect['user_id_idx'] = user_notcollect['user_id'].map(user_id_mapping)
    user_notcollect['song_id_idx'] = user_notcollect['song'].map(song_id_mapping)

    # Surprise expects (user, item, true_rating) triples.
    testset = list(zip(user_notcollect['user_id_idx'], user_notcollect['song_id_idx'], user_notcollect['label']))
    predictions = algo.test(testset)
    scores = [est for (_, _, _, est, _) in predictions]
    user_notcollect['score'] = scores
    ranked_songs = user_notcollect.sort_values(by='score', ascending=False)
    for n in range(len(top_N)):
        # Slice from the full ranking each time so the result does not depend
        # on top_N being listed in descending order.
        recommended_songs = ranked_songs.head(top_N[n])
        correct[n] += recommended_songs['label'].sum()
        total_rcmd[n] += top_N[n]
        total_collect[n] += len(user_test)

In [None]:
# Print and plot precision / recall / F1 from the counters filled by the ALS-baseline loop.
result_output(top_N,correct,total_rcmd,total_collect)