In [None]:
from pathlib import Path

import scipy
import pandas as pd
import implicit

In [None]:
# Play-count triplets: tab-separated, read as headerless with one
# (user_id, song_id, play_count) record per line.
column_names = ['user_id', 'song_id', 'play_count']
triplet_df = pd.read_csv(
    'data/train_triplets.txt',
    sep='\t',
    header=None,
    names=column_names,
)
triplet_df

In [None]:
# Keep only rows whose play_count is greater than 1.
triplet_df = triplet_df.loc[triplet_df['play_count'] > 1]

In [None]:
# Show the triplets that remain after the play_count filter.
triplet_df

In [None]:
track_column_names = ['track_id', 'song_id', 'artist', 'title']
# Fields are delimited by the literal token "<SEP>" with optional surrounding
# whitespace. The pattern is a raw string so that `\s` reaches the regex engine
# instead of being an invalid Python string escape (which warns on recent
# Python versions). A regex separator requires the python parsing engine.
unique_tracks_df = pd.read_csv(
    'data/p02_unique_tracks.txt',
    sep=r'\s*<SEP>\s*',
    names=track_column_names,
    engine='python',
)
unique_tracks_df

In [None]:
# Keep one metadata row per song_id (the first occurrence). Reassigning instead
# of using inplace=True avoids mutating the frame that was displayed above.
unique_tracks_df = unique_tracks_df.drop_duplicates(subset=['song_id'])
len(triplet_df), len(unique_tracks_df)

In [None]:
# Left join: every triplet row is kept, and track metadata is attached where
# the song_id has a match in unique_tracks_df.
df = triplet_df.merge(unique_tracks_df, how='left', on='song_id')
df

In [None]:
# Human-readable label in the form "<title> - <artist>".
df['song'] = df['title'] + ' - ' + df['artist']
df

In [None]:
# Narrow the merged table to the identifier, label and play-count columns.
df = df.loc[:, ['user_id', 'song_id', 'track_id', 'song', 'play_count']]
df

In [None]:
# Cache the merged table on disk. The file is written only when it is missing,
# so a fresh "Restart & Run All" works without un-commenting code, and later
# runs reuse the existing file. index=False keeps the row index out of the CSV.
songs_path = Path('songs_filtred.csv')
if not songs_path.exists():
    df.to_csv(songs_path, index=False)
songs = pd.read_csv(songs_path)

In [None]:
# Integer codes for users and songs, assigned in order of first appearance.
# They are used further down as row / column positions of the sparse matrix.
songs['user_idx'] = songs['user_id'].factorize()[0]
songs['song_idx'] = songs['song_id'].factorize()[0]
songs

In [None]:
class MusicData:
    """Lookup helpers over a listening-history table.

    The table passed as ``data`` is read through its ``user_id``, ``song_id``
    and ``song`` columns.
    """

    def __init__(self, data):
        """Store ``data`` and build a ``song_id -> song`` label dictionary."""
        self.data = data
        # For a song_id that occurs on several rows the last row's label wins.
        self.song_id_to_name = data.set_index('song_id')['song'].to_dict()

    def get_user_songs(self, user_id):
        """Return the labels of the distinct songs on rows matching ``user_id``.

        Labels follow the order in which the songs first appear in the data;
        an unknown user yields an empty list.
        """
        is_user = self.data['user_id'] == user_id
        listened_ids = self.data.loc[is_user, 'song_id'].unique()
        return [self.song_id_to_name[listened_id] for listened_id in listened_ids]

    def get_song_users(self, song_id):
        """Return an array of the distinct ``user_id`` values on rows matching ``song_id``."""
        is_song = self.data['song_id'] == song_id
        return self.data.loc[is_song, 'user_id'].unique()

    def get_song_name(self, song_id):
        """Return the label stored for ``song_id``, or a fixed message if it is unknown."""
        return self.song_id_to_name.get(song_id, "Song ID not found in data")

    def get_top_songs(self, n=10):
        """Return the ``n`` song ids that occur on the most rows, most frequent first."""
        row_counts = self.data['song_id'].value_counts()
        return row_counts.iloc[:n].index.tolist()

    def get_top_users(self, n=10):
        """Return the ``n`` user ids that occur on the most rows, most frequent first."""
        row_counts = self.data['user_id'].value_counts()
        return row_counts.iloc[:n].index.tolist()

# Lookup helper over the cached listening table.
music_data = MusicData(data=songs)

In [None]:
# Interaction table: integer user / song codes plus the play count.
interaction_columns = ['user_idx', 'song_idx', 'play_count']
X = songs[interaction_columns]
X

In [None]:
# (number of distinct users, number of distinct songs)
X['user_idx'].nunique(), X['song_idx'].nunique()

In [None]:
# Example lookup: the song labels recorded for one user id.
userId = 'b7815dbb206eb2831ce0fe040d0aa537e2e800f7'
listened_songs = music_data.get_user_songs(userId)
print("Songs listened by a user:", listened_songs)

In [None]:
# Example lookup: the label stored for one song id.
songId = 'SOWYSKH12AF72A303A'
song_name = music_data.get_song_name(songId)
print("Song name for a song ID:", song_name)

In [None]:
# Index the interactions by (user_idx, song_idx). X is rebuilt from `songs`
# instead of being mutated in place, so this cell can be re-run safely: an
# in-place set_index raises KeyError on a second run because the columns
# have already been moved into the index.
X = songs[['user_idx', 'song_idx', 'play_count']].set_index(["user_idx", "song_idx"])

In [None]:
# First 30 rows of the interaction table.
X.head(30)

In [None]:
# Index level 0 (user_idx) for the first 30 rows.
X.index.get_level_values(0)[:30]

In [None]:
# Index level 1 (song_idx) for every row.
X.index.get_level_values(1)

In [None]:
# Sparse matrix in (data, (row, col)) form: rows are user_idx, columns are
# song_idx, and values are the play counts as floats. The shape is inferred
# from the largest row / column index.
coo = scipy.sparse.coo_matrix(
    (
        X['play_count'].astype(float),
        (
            X.index.get_level_values('user_idx'),
            X.index.get_level_values('song_idx'),
        ),
    )
)

In [None]:
# ALS factors start from random values; a fixed random_state seeds that
# initialisation so that re-running the notebook is more reproducible.
implict_model = implicit.als.AlternatingLeastSquares(
    factors=50,
    iterations=10,
    regularization=0.01,
    random_state=42,
)

In [None]:
# Train the model on the CSR form of the interaction matrix.
implict_model.fit(coo.tocsr())

In [None]:
userId = 'b7815dbb206eb2831ce0fe040d0aa537e2e800f7'
# Translate the external user id into the row position used by the matrix.
user_id = songs[songs.user_id == userId].user_idx.values[0]
n = 10000  # number of recommendations to request
user_items = coo.tocsr()
# recommend() expects the interaction row of the SAME user as its second
# argument; it uses that row to filter out items the user already has.
# Indexing with `n` would pass an unrelated user's history instead.
songs_ids, scores = implict_model.recommend(user_id, user_items[user_id], N=n)

In [None]:
# First 10 entries of the recommended song indices.
songs_ids[:10]

In [None]:
# First 10 entries of the matching recommendation scores.
scores[:10]

In [None]:
# Map the recommended song_idx values back to song_id in ranking order.
# Filtering `songs` with isin() would return ids in table order, so the
# "first 10" would no longer be the 10 best-scored recommendations.
# `songs_ids` is deliberately not overwritten, so this cell can be re-run.
song_idx_to_id = songs.drop_duplicates(subset='song_idx').set_index('song_idx')['song_id']
recommended_song_ids = song_idx_to_id.loc[songs_ids[:10]].tolist()
[music_data.get_song_name(song_id) for song_id in recommended_song_ids]

In [None]:
# Song labels present in the data for userId.
music_data.get_user_songs(userId)

In [None]:
# Query the model for items similar to one song, addressed by its song_idx.
seed_song_idx = 118
itemids, scores = implict_model.similar_items(itemid=seed_song_idx)
itemids

In [None]:
# Map the similar song_idx values back to song_id while keeping the model's
# order. isin() filtering would return ids in table order, so they would no
# longer line up with `scores`.
itemids = (
    songs.drop_duplicates(subset='song_idx')
    .set_index('song_idx')['song_id']
    .loc[itemids]
    .to_numpy()
)
itemids

In [None]:
# Labels for each id in itemids, in the same order.
list(map(music_data.get_song_name, itemids))