In [1]:
import pandas as pd
import polars as pl
from pathlib import Path
import numpy as np
import scipy
import implicit

import implicit
from scipy.sparse import coo_matrix
import scipy.sparse
from sklearn.model_selection import train_test_split

One very common problem to solve is when you have a number of users and a number of products, and you want to recommend which products are most likely to be useful for which users. There are many variations of this: for example, recommending movies (such as on Netflix), figuring out what to highlight for a user on a home page, deciding what stories to show in a social media feed, and so forth. There is a general solution to this problem, called collaborative filtering, which works like this: look at what products the current user has used or liked, find other users that have used or liked similar products, and then recommend other products that those users have used or liked.

## 1. Non-personalized approach.

### 1.1 Top-250 tracks

In [None]:
import polars as pl

# Load and prepare the data
triplet_columns = ['user_id', 'song_id', 'play_count']
track_columns = ['track_id', 'song_id', 'artist', 'title']

# Read datasets
triplet_df = pl.read_csv('data/train_triplets.txt', separator='\t', new_columns=triplet_columns, use_pyarrow=True)
unique_tracks_df = pl.read_csv('data/p02_unique_tracks.csv', new_columns=track_columns, use_pyarrow=True)

# Aggregate and join data
song_play_counts = triplet_df.group_by('song_id').agg(pl.sum('play_count').alias('play_count')).sort('play_count', descending=True).limit(250)
top_250_tracks = song_play_counts.join(unique_tracks_df, on='song_id').select('artist', 'title', 'play_count').with_row_index(name='index number')
top_250_tracks = top_250_tracks.sort('play_count', descending=True)

apply .head(5) to the resulting dataframe of the top-250 tracks, the result should be exactly like this:
artist title play_count
- 0 Dwight Yoakam You're The One 726885
- 1 Björk Undo 648239
- 2 Kings Of Leon Revelry 527893
- 3 Harmonia Sehr kosmisch 425463
- 4 Barry Tuckwell/Academy of St Martin-in-the-Fie... Horn Concerto No. 4 in E flat K495: II. Romanc... 389880

In [None]:
top_250_tracks.head(5)

Apply .tail(5) to the resulting dataframe of the top-250 tracks, the result
should be exactly like this:
artist title play_count
- 245 Triple Six Mafia Now I'm High_ Really High 35253
- 246 The Red Jumpsuit Apparatus Face Down (Album Version) 35245
- 247 Linkin Park New Divide (Album Version) 35191
- 248 Selena Gomez & The Scene Naturally 35074
- 249 Creedence Clearwater Revival Have You Ever Seen The Rain 34831



In [None]:
top_250_tracks.tail(5)

### 1.2 Top-100 tracks by genre

In [None]:
genre_column_names = ['track_id', 'majority_genre', 'minority_genre']
tagtraum_genre_df = pd.read_csv('data/p02_msd_tagtraum_cd2.cls', sep='\t', comment='#', names=genre_column_names)
tagtraum_genre_df.drop(columns=['minority_genre'], axis=1, inplace=True)
tagtraum_genre_df

In [None]:
triplet_df

In [None]:
merged_df_genre = pd.merge(pd.merge(tagtraum_genre_df, unique_tracks_df.to_pandas(), on='track_id'), triplet_df.to_pandas(), on='song_id')

In [None]:
merged_df_genre

In [None]:
def get_top_and_bottom_tracks(merged_df_genre, selected_genre):
    # Filter by the specified genre
    genre_subset = merged_df_genre[merged_df_genre['majority_genre'] == selected_genre]

    # Aggregate play counts for each track in the selected genre
    track_play_counts = genre_subset.groupby(['artist', 'title'])['play_count'].sum()

    # Sort tracks by play count in descending order
    sorted_tracks = track_play_counts.sort_values(ascending=False).head(100)

    # Get the top 5 and bottom 5 tracks
    top_tracks = sorted_tracks.head(5)
    bottom_tracks = sorted_tracks.tail(5)

    return top_tracks, bottom_tracks

#### Rock

Apply .head(5) to the resulting dataframe of the top-100 tracks
- for the genre, the result should be exactly like this:
- artist title play_count
- 0 Björk Undo 648239
- 1 Kings Of Leon Revelry 527893
- 2 Harmonia Sehr kosmisch 425463
- 3 OneRepublic Secrets 292642
- 4 Tub Ring Invalid 268353

Apply .tail(5) to the resulting dataframe of the top-100 tracks
for the genre, the result should be exactly like this:
- artist title play_count
- 95 Metric Gold Guns Girls 28148
- 96 Pearl Jam Encore Break 27579
- 97 Daughtry No Surprise 27187
- 98 Eric Clapton Tears In Heaven 26999
- 99 Nick Lowe All Men Are Liars 26683

In [None]:
selected_genre = 'Rock'

top_tracks, bottom_tracks = get_top_and_bottom_tracks(merged_df_genre, selected_genre)

print(f"Top 5 tracks for the genre: {selected_genre}")
top_tracks

In [None]:
print(f"\nBottom 5 tracks for the genre: {selected_genre}")
bottom_tracks

#### Rap

**Apply .head(5) to the resulting dataframe of the top-100 tracks
for the genre, the result should be exactly like this:**
artist title play_count
- 0 Alliance Ethnik Représente 241669
- 1 Beastie Boys The Maestro 72381
- 2 Eminem Without Me 63918
- 3 Black Eyed Peas Imma Be 62438
- 4 Kid Cudi Up Up & Away 59810

**Apply .tail(5) to the resulting dataframe of the top-100 tracks
for the genre, the result should be exactly like this:**
artist title play_count
- 95 Shwayze Buzzin' 7384
- 96 Orishas El Kilo 7324
- 97 Snoop Dogg Sexual Eruption 7171
- 98 Bone Thugs-N-Harmony Tha Crossroads 7124
- 99 Orishas Habana 6998

In [None]:
selected_genre = 'Rap'

top_tracks, bottom_tracks = get_top_and_bottom_tracks(merged_df_genre, selected_genre)

# Display the results
print(f"Top 5 tracks for the genre: {selected_genre}")
top_tracks

In [None]:
print(f"\nBottom 5 tracks for the genre: {selected_genre}")
bottom_tracks

#### Electronic

**Apply .head(5) to the resulting dataframe of the top-100 tracks
for the genre, the result should be exactly like this:**
artist title play_count
- 0 Southside Spinners Luvstruck 84225
- 1 The Black Keys Tighten Up 81179
- 2 Deadmau5 Ghosts 'n' Stuff (Original Instrumental Mix) 63951
- 3 Daft Punk Harder Better Faster Stronger 63170
- 4 Clara Hill Clara meets Slope - Hard To Say 58887

**Apply .tail(5) to the resulting dataframe of the top-100 tracks
for the genre, the result should be exactly like this:**
artist title play_count
- 95 Nicolette No Government 9541
- 96 Two Door Cinema Club Eat That Up_ It's Good For You 9524
- 97 Moby Why Does My Heart Feel So Bad? (2006 Digital R... 9491
- 98 Death In Vegas Girls 9490
- 99 Johan Gielen Flash 9431

In [None]:
selected_genre = 'Electronic'

top_tracks, bottom_tracks = get_top_and_bottom_tracks(merged_df_genre, selected_genre)

# Display the results
print(f"Top 5 tracks for the genre: {selected_genre}")
top_tracks

In [None]:
print(f"\nBottom 5 tracks for the genre: {selected_genre}")
bottom_tracks

## 2. Collections

It should return on a given keyword (love, war, happiness) a dataframe (50tracks) with the following fields: index number, artist name, track title, play count. The table should be sorted by the play count descendingly. Try

### 1. baseline

In [None]:
mxm_dataset_path = 'data/mxm_dataset_train.txt'
merged_songs_path = 'data/songs.csv'


if not Path('data/songs.csv').exists():
    print('Merging songs...')
    triplet_columns = ['user_id', 'song_id', 'play_count']
    track_columns = ['track_id', 'song_id', 'artist', 'title']
    
    triplet_df = pl.read_csv('data/train_triplets.txt', separator='\t', new_columns=triplet_columns, use_pyarrow=True)
    unique_tracks_df = pl.read_csv('data/p02_unique_tracks.csv', new_columns=track_columns)
    triplet_df = triplet_df.group_by('song_id').agg(pl.sum('play_count').alias('play_count')).sort('play_count', descending=True)
    mergerd_songs = triplet_df.join(unique_tracks_df, on='song_id', how='left').select('track_id', 'artist', 'title', 'play_count')
    mergerd_songs.write_csv('data/songs.csv')
else:
    print('Reading songs...')
    mergerd_songs = pl.read_csv(merged_songs_path, use_pyarrow=True)

In [None]:
mergerd_songs

In [None]:
class MXMDataLoader:
    def __init__(self, dataset_path_mxm, mergerd_songs):
        self.dataset_path = dataset_path_mxm
        self.songs = mergerd_songs
        
    def load(self):
        top_words = []
        filtered_tracks = []

        with open(self.dataset_path, 'r', encoding='utf-8') as file:
            for line in file:
                if line.startswith('#') or line.strip() == '':
                    continue
                elif line.startswith('%'):
                    top_words = line[1:].strip().split(',')
                else:
                    elements = line.strip().split(',')
                    track_id = elements[0]
                    word_counts = {int(count.split(':')[0]) - 1: int(count.split(':')[1]) for count in elements[2:]}
                    filtered_tracks.append((track_id, word_counts))
        self.top_words = top_words
        self.filtered_tracks = filtered_tracks
        
    def get_song_lyrics(self, track_id):
        def get_words(top_words, word_counts):
            return {top_words[index]:count for index, count in word_counts.items()}
            
        
        for track in self.filtered_tracks:
            if track[0] == track_id:
                return get_words(self.top_words, track[1])
        raise ValueError(f"Track ID '{track_id}' not found in the dataset.")
    
    def get_sorted_tracks_by_keyword(self, keyword, threshold):
        
        try:
            keyword_index = self.top_words.index(keyword)
        except ValueError:
            print(f"Keyword '{keyword}' not found in the dataset.")
            return
            
        filtered_tracks = []
        for idx, (track_id, word_counts) in enumerate(self.filtered_tracks): 
            keyword_count = word_counts.get(keyword_index, 0)
            if keyword_count >= threshold:
                row_df = self.songs.filter(pl.col('track_id') == track_id)
                if len(row_df) > 0:
                    _ , artist, title, play_count = row_df[0].row(0)
                    filtered_tracks.append((idx, track_id, artist, title, play_count, keyword_count))
        print("Done ✅ filtering tracks by keyword.")    
        filtered_tracks_df = pl.DataFrame(filtered_tracks, schema=['index_number', 'track_id' ,'artist', 'title', 'play_count', 'keyword_count']).sort('play_count', descending=True).head(50)
        return filtered_tracks_df

In [None]:
mxm_loader = MXMDataLoader(mxm_dataset_path, mergerd_songs)
mxm_loader.load()

In [None]:
# Try ‘love’ as the keyword and look through the lyrics of 3 random tracks given in the list of recommendations – do they have ‘love’ in the lyrics?
tracks = mxm_loader.get_sorted_tracks_by_keyword('life', 6)
tracks

In [None]:
print(tracks)

In [None]:
mxm_loader.get_song_lyrics('TRJRECT12903CBADA3')['love']

In [None]:
import random
idx = random.randint(0, len(tracks) - 1)
track_id = 'TROHFJK12903CC4BCE'
print(f"Track ID: {track_id}")
print(f"Lyrics: {mxm_loader.get_song_lyrics(track_id)}")


In [None]:
# If an incorrect keyword is given, the exception is handled
incorrect_keyword_tracks = mxm_loader.get_sorted_tracks_by_keyword('incorrect_keyword', 6)
incorrect_keyword_tracks

### 2. word2vec

In [None]:
import gensim.downloader as api
model_name = 'glove-wiki-gigaword-300'
wv = api.load(model_name)

In [None]:
try:
    vec_love = wv['love']
except KeyError:
    print("The word 'love' does not appear in this model")
vec_love

In [None]:
class MXMDataLoaderW2V:
    def __init__(self, dataset_path_mxm, mergerd_songs, word_vectors):
        self.dataset_path = dataset_path_mxm
        self.songs = mergerd_songs
        self.word_vectors = word_vectors
        self.top_words = []
        self.filtered_tracks = []

    def load(self):
        with open(self.dataset_path, 'r', encoding='utf-8') as file:
            for line in file:
                if line.startswith('#') or line.strip() == '':
                    continue
                elif line.startswith('%'):
                    self.top_words = line[1:].strip().split(',')
                else:
                    elements = line.strip().split(',')
                    track_id = elements[0]
                    word_counts = {int(count.split(':')[0]) - 1 : int(count.split(':')[1]) for count in elements[2:]}
                    self.filtered_tracks.append((track_id, word_counts))

    def get_similar_keywords(self, keyword, top_n=5):
        """Get top_n similar words to the given keyword."""
        try:
            similar_words = self.word_vectors.most_similar(positive=[keyword], topn=top_n)
            return [keyword] + [word for word, _ in similar_words]  # Include the keyword itself
        except KeyError:
            print(f"Keyword '{keyword}' not found in the word2vec model.")
            return [keyword]

    def get_sorted_tracks_by_keyword(self, keyword, threshold, max_tracks=50):
        similar_keywords = self.get_similar_keywords(keyword)
        print(f"Similar keywords to '{keyword}': {similar_keywords}")
        similar_keyword_indices = [self.top_words.index(word) for word in similar_keywords if word in self.top_words]

        filtered_tracks = []
        for track_id, word_counts in self.filtered_tracks:
            total_count = sum(word_counts.get(idx, 0) for idx in similar_keyword_indices[:5])
            if total_count >= threshold:
                row_df = self.songs.filter(pl.col('track_id') == track_id)
                if len(row_df) > 0:
                    _ , artist, title, play_count = row_df[0].row(0)
                    filtered_tracks.append((idx, track_id, artist, title, play_count, total_count))
        print("Done ✅ filtering tracks by keyword.")    
        filtered_tracks_df = pl.DataFrame(filtered_tracks, schema=['index_number', 'track_id' ,'artist', 'title', 'play_count', 'keyword_count']).sort('play_count', descending=True).head(50)
                
        return filtered_tracks_df

In [None]:
# Try ‘love’ as the keyword and look through the lyrics of 3 random tracks given in the list of recommendations – do they have ‘love’ in the lyrics?
mxm_loader = MXMDataLoaderW2V(mxm_dataset_path, mergerd_songs, wv)
mxm_loader.load()

In [None]:
tracks = mxm_loader.get_sorted_tracks_by_keyword('happy', 6, max_tracks=50)
print(tracks)

### 3. Classification Task

In [None]:
import polars as pl

class MXMDataLoaderClassification:
    def __init__(self, dataset_path_mxm, merged_songs, wv):
        self.dataset_path = dataset_path_mxm
        self.songs = merged_songs
        self.top_words = []
        self.filtered_tracks = []
        self.word_vectors = wv

    def load(self):
        with open(self.dataset_path, 'r', encoding='utf-8') as file:
            for line in file:
                if line.startswith('#') or line.strip() == '':
                    continue
                elif line.startswith('%'):
                    self.top_words = line[1:].strip().split(',')
                else:
                    elements = line.strip().split(',')
                    track_id = elements[0]
                    word_counts = {int(count.split(':')[0]) - 1: int(count.split(':')[1]) for count in elements[2:]}
                    self.filtered_tracks.append((track_id, word_counts))

    def classify_tracks_by_keywords(self, keywords):
        tracks_classification = []
        # Create a dictionary to find indices of keywords in top_words
        keyword_indices = {keyword: self.top_words.index(keyword) for keyword in keywords if keyword in self.top_words}

        for track_id, word_counts in self.filtered_tracks:
            keyword_presence = {keyword: word_counts.get(idx, 0) for keyword, idx in keyword_indices.items()}

            # Determine the most prevalent keyword based on counts
            if keyword_presence:
                most_prevalent_keyword = max(keyword_presence, key=keyword_presence.get)
                max_count = keyword_presence[most_prevalent_keyword]
                if max_count > 0:  # Only consider tracks where the keyword count is greater than zero
                    row_df = self.songs.filter(pl.col('track_id') == track_id)
                    if len(row_df) > 0:
                        _, artist, title, play_count = row_df[0].row(0)
                        tracks_classification.append((track_id, artist, title, play_count, most_prevalent_keyword))

        # Convert to DataFrame
        classification_df = pl.DataFrame(tracks_classification, schema=['track_id', 'artist', 'title', 'play_count', 'label'])
        self.classification_df = classification_df
        return classification_df

In [None]:
keywords = ["love", "war", "happiness", "loneliness", "money"]
loader = MXMDataLoaderClassification(mxm_dataset_path, mergerd_songs)
loader.load()

In [None]:
classified_tracks = loader.classify_tracks_by_keywords(keywords)
classified_tracks

In [None]:
classified_tracks['label'].value_counts()

In [None]:
import polars as pl
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report



class MXMDataLoaderClassification:
    def __init__(self, dataset_path_mxm):
        self.dataset_path = dataset_path_mxm
        self.top_words = []
        self.filtered_tracks = []

    def load(self):
        with open(self.dataset_path, 'r', encoding='utf-8') as file:
            for line in file:
                if line.startswith('#') or line.strip() == '':
                    continue
                elif line.startswith('%'):
                    self.top_words = line[1:].strip().split(',')
                else:
                    elements = line.strip().split(',')
                    track_id = elements[0]
                    word_counts = {int(count.split(':')[0]) - 1: int(count.split(':')[1]) for count in elements[2:]}
                    self.filtered_tracks.append((track_id, word_counts))
    
    def train(self):
        X = []
        y = []

        for track_id, word_counts in self.filtered_tracks:
            word_counts_vec = [word_counts.get(idx, 0) for idx in range(len(loader.top_words))]
            X.append(word_counts_vec[:728])
            label = self.labeled_tracks.filter(pl.col('track_id') == track_id)['label'].item(0)
            y.append(label)
            
        tfidf_transformer = TfidfTransformer()
        tfidf_matrix = tfidf_transformer.fit_transform(X)

        X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, y, test_size=0.2, random_state=42)
        print("Training and testing data shapes:",  X_train.shape, X_test.shape, len(y_train), len(y_test))

        classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        classifier.fit(X_train, y_train)

        y_pred = classifier.predict(X_test)


        print(classification_report(y_test, y_pred))
        
        return classifier

    def label_tracks(self, categories):
        track_labels = []

        # Map keywords to their indices for quick lookup
        keyword_to_index = {word: idx for idx, word in enumerate(self.top_words)}

        for track_id, word_counts in self.filtered_tracks:
            category_counts = {category: 0 for category in categories}

            # Accumulate counts for each category based on associated keywords
            for category, keywords in categories.items():
                for keyword in keywords:
                    idx = keyword_to_index.get(keyword)
                    if idx is not None:
                        category_counts[category] += word_counts.get(idx, 0)

            # Determine the category with the highest count
            if category_counts:
                dominant_category = max(category_counts, key=category_counts.get)
                track_labels.append((track_id, dominant_category))

        labeled_df = pl.DataFrame(track_labels, schema=['track_id', 'label'])
        self.labeled_tracks = labeled_df
        return labeled_df

In [None]:
loader = MXMDataLoaderClassification(mxm_dataset_path)
loader.load()
categories = {
    "love": ["love", "heart"],
    "war": ["war", "battle"],
    "money": ["money", "cash"],
    "loneliness": ["lonely", "alone"]
}
labeled_tracks = loader.label_tracks(categories)
print(labeled_tracks)

In [None]:
loader.train()

## 3. People similar to you listen

In [None]:
triplet_df

In [None]:
unique_tracks_df

In [2]:
if not Path('data/songs_filtered.csv').exists():
    
    print('Merging songs...')
    songs = pd.merge(triplet_df.to_pandas(), unique_tracks_df.to_pandas(), on='song_id', how='left')
    songs['song'] = songs['title']+' - ' + songs['artist']
    songs = songs[['user_id', 'song_id', 'track_id', 'song', 'play_count']]
    songs.to_csv('data/songs_filtered.csv')
else:
    print('Reading songs...')
    songs = pd.read_csv('data/songs_filtered.csv')
    songs['user_idx'] = pd.factorize(songs['user_id'])[0]
    songs['song_idx'] = pd.factorize(songs['song_id'])[0]

songs

Reading songs...


Unnamed: 0,user_id,song_id,track_id,song,play_count,user_idx,song_idx
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,TRIRLYL128F42539D1,Nothing from Nothing - Billy Preston,1,0,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,TRMHBXZ128F4238406,Entre Dos Aguas - Paco De Lucia,2,0,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,TRYQMNI128F147C1C7,Under Cold Blue Stars - Josh Rouse,1,0,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,TRAHZNE128F9341B86,Riot Radio (Soundtrack Version) - The Dead 60s,1,0,3
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBNZDC12A6D4FC103,TRJPXGD128F92F17D7,Sin límites (I) - Amset,1,0,4
...,...,...,...,...,...,...,...
49664522,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUHHHH12AF729E4AF,TRKUAEO128F933ABFC,We're Back - Eminem / Obie Trice / Stat Quo / ...,2,1019317,4979
49664523,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUJVIT12A8C1451C1,TRRNFHH128F92D262D,Savior - Rise Against,1,1019317,1773
49664524,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUSMXX12AB0185C24,TRSLDDC12903CC36E7,OMG - Usher featuring will.i.am,1,1019317,219
49664525,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOWYSKH12AF72A303A,TRNJQAM128F14557AF,Downfall (Album Version) - matchbox twenty,3,1019317,3086


In [3]:
class MusicData:
    def __init__(self, data):
        self.data = data
        self.song_id_to_name = pd.Series(data.song.values, index=data.song_id).to_dict()

    def get_user_songs(self, user_id):
        user_data = self.data[self.data['user_id'] == user_id]
        user_songs = [self.song_id_to_name[song_id] for song_id in user_data['song_id'].unique()]
        return user_songs

    def get_song_users(self, song_id):
        song_data = self.data[self.data['song_id'] == song_id]
        song_users = song_data['user_id'].unique()
        return song_users

    def get_song_name(self, song_id):
        return self.song_id_to_name.get(song_id, "Song ID not found in data")

    def get_top_songs(self, n=10):
        top_songs = self.data['song_id'].value_counts()[:n].index.tolist()
        return top_songs

    def get_top_users(self, n=10):
        top_users = self.data['user_id'].value_counts()[:n].index.tolist()
        return top_users
    
    
    def get_recommendations(self, user_id, n=10):
        user_songs = self.get_user_songs(user_id)
        all_songs = self.data['song_id'].unique()
        recommendations = [song_id for song_id in all_songs if song_id not in user_songs][:n]
        return recommendations


music_data = MusicData(songs)

In [4]:
X = songs[['user_idx', 'song_idx', 'play_count']]
X

Unnamed: 0,user_idx,song_idx,play_count
0,0,0,1
1,0,1,2
2,0,2,1
3,0,3,1
4,0,4,1
...,...,...,...
49664522,1019317,4979,2
49664523,1019317,1773,1
49664524,1019317,219,1
49664525,1019317,3086,3


In [5]:
X.set_index(["user_idx", "song_idx"], inplace=True)

In [6]:
X.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,play_count
user_idx,song_idx,Unnamed: 2_level_1
0,0,1
0,1,2
0,2,1
0,3,1
0,4,1
0,5,2
0,6,1
0,7,1
0,8,1
0,8,1


In [7]:
X.index.get_level_values(0)[:30]

Index([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0],
      dtype='int64', name='user_idx')

In [8]:
X.index.get_level_values(1)

Index([     0,      1,      2,      3,      4,      5,      6,      7,      8,
            8,
       ...
         5322,  13096, 142665,  23083,  70144,   4979,   1773,    219,   3086,
         1405],
      dtype='int64', name='song_idx', length=49664527)

In [9]:
coo = scipy.sparse.coo_matrix( (X.play_count.astype(float), (X.index.get_level_values(0), X.index.get_level_values(1),),))

In [10]:
implict_model = implicit.als.AlternatingLeastSquares(
    factors=50, iterations=10, regularization=0.01
)

  check_blas_config()


In [11]:
implict_model.fit(coo.tocsr())

  0%|          | 0/10 [00:00<?, ?it/s]

In [12]:
def recommend_songs(model, songs, user_id, matrix, n=10):
    
    user_idx = songs[songs.user_id == user_id].user_idx.values[0]
    # songs_ids, scores = implict_model.recommend(user_idx, coo.tocsr()[n], N=n)

    recommendations = model.recommend(user_idx, matrix, N=n)
    recommended_song_ids = [songs.loc[songs['song_idx'] == idx, 'song_id'].iat[0] for idx, _ in recommendations]
    return [music_data.song_name(song_id) for song_id in recommended_song_ids]

userId = 'b7815dbb206eb2831ce0fe040d0aa537e2e800f7'
print("Songs recommended for a user:", recommend_songs(implict_model, songs, userId, coo, n=10))

ValueError: user_items needs to be a CSR sparse matrix

In [1]:
def recommend_songs(model, songs, user_id, matrix, n=10):
    
    user_idx = songs[songs.user_id == user_id].user_idx.values[0]
    # songs_ids, scores = implict_model.recommend(user_idx, coo.tocsr()[n], N=n)

    songs_ids, scores = model.recommend(user_idx, matrix.tocsr()[n], N=n)
    songs_ids = songs[songs.song_idx.isin(songs_ids)].song_id.unique()
    recommended_songs = [music_data.get_song_name(song_id) for song_id in songs_ids]
    
    # check if the user has already listened to the recommended songs and remove them
    user_songs = music_data.get_user_songs(user_id)
    recommended_songs = [song for song in recommended_songs if song not in user_songs]
    return recommended_songs[:n]

In [None]:

# write a function like def recommend_songs(model, songs, user_id, matrix, n=10) that does the following:

# Try songId SOWYSKH12AF72A303A
# The recommendation list contains 10 tracks in the format described in the subject?
# The list should not contain the track that was given as the argument
# While training the model the average p@k was greater than 10%?


 
def recommend_songs_by_song_id(model, songs, song_id, matrix, n=10):
    song_idx = songs[songs.song_id == song_id].song_idx.values[0]    
    itemids, scores = implict_model.similar_items(itemid=118)
    itemids
    itemids = songs[songs.song_idx.isin(itemids)].song_id.unique()
    itemids
    [music_data.get_song_name(item_id) for item_id in itemids]


In [3]:
recommend_songs(implict_model, songs, 'b7815dbb206eb2831ce0fe040d0aa537e2e800f7', coo, n=10)

NameError: name 'implict_model' is not defined

In [None]:
userId = 'b7815dbb206eb2831ce0fe040d0aa537e2e800f7'
user_id = songs[songs.user_id == userId].user_idx.values[0]
n = 10
songs_ids, scores = implict_model.recommend(user_id, coo.tocsr()[n], N=n)

In [None]:
songs_ids[:10]

In [None]:
scores[:10]

In [None]:
songs_ids = songs[songs.song_idx.isin(songs_ids)].song_id.unique()
[music_data.get_song_name(song_id) for song_id in songs_ids[:10]]

In [None]:
music_data.get_user_songs(userId)

In [None]:
itemids, scores = implict_model.similar_items(itemid=118)
itemids

In [None]:
itemids = songs[songs.song_idx.isin(itemids)].song_id.unique()
itemids

In [None]:
[music_data.get_song_name(item_id) for item_id in itemids]

## Refacored Version

In [None]:
import pandas as pd
from pathlib import Path

import implicit
from scipy.sparse import coo_matrix
import scipy.sparse
from sklearn.model_selection import train_test_split

class MusicData:
    def __init__(self, dataframe):
        self.data = dataframe
        self.song_id_to_name = pd.Series(dataframe['song'].values, index=dataframe['song_id']).to_dict()

    def get_songs_for_user(self, user_id):
        user_songs = self.data[self.data['user_id'] == user_id]['song_id'].unique()
        return [self.song_id_to_name.get(song_id, "Song ID not found") for song_id in user_songs]

    def get_users_for_song(self, song_id):
        return self.data[self.data['song_id'] == song_id]['user_id'].unique()

    def song_name(self, song_id):
        return self.song_id_to_name.get(song_id, "Song ID not found in data")

    def top_songs(self, n=10):
        return self.data['song_id'].value_counts().head(n).index.tolist()

    def top_users(self, n=10):
        return self.data['user_id'].value_counts().head(n).index.tolist()

def prepare_data():
    path = Path('data/songs_filtered.csv')
    if not path.exists():
        songs = pd.merge(triplet_df.to_pandas(), unique_tracks_df.to_pandas(), on='song_id', how='left')
        songs['song'] = songs['title'] + ' - ' + songs['artist']
        songs = songs[['user_id', 'song_id', 'track_id', 'song', 'play_count']]
        songs.to_csv(path, index=False)
    else:
        songs = pd.read_csv(path)

    songs['user_idx'] = pd.factorize(songs['user_id'])[0]
    songs['song_idx'] = pd.factorize(songs['song_id'])[0]
    return songs


def create_sparse_matrix(data):
    """ Converts a DataFrame back to a sparse matrix. """
    ratings_coo_matrix = coo_matrix((data['play_count'], (data['user_idx'], data['song_idx'])), 
                      shape=(data['user_idx'].max() + 1, data['song_idx'].max() + 1))
    return ratings_coo_matrix.tocsr()

In [None]:
songs = prepare_data()
music_data = MusicData(songs)
songs = songs[['user_id', 'user_idx', 'song_idx', 'play_count']]
all_data = create_sparse_matrix(songs[['user_idx', 'song_idx', 'play_count']])
# train_data, test_data = train_test_split(songs, test_size=0.2, random_state=42)

# train_matrix = create_sparse_matrix(train_data)
# test_matrix = create_sparse_matrix(test_data)

In [None]:
if not Path('data/train_matrix.npz').exists():
    print('Saving data...')
    songs.to_csv('data/songs.csv', index=False)
    scipy.sparse.save_npz('data/train_matrix.npz', all_data)
    scipy.sparse.save_npz('data/coo_matrix.npz', coo)
else:
    print('Loading data...')
    all_data = scipy.sparse.load_npz('data/train_matrix.npz')
    songs = pd.read_csv('data/songs.csv')
    coo = scipy.sparse.load_npz('data/coo_matrix.npz')

In [None]:
# print(f"{np.allclose(all_data.toarray()[:100], coo.toarray()[:100])}")

In [None]:
model = implicit.als.AlternatingLeastSquares(factors=50, iterations=10, regularization=0.01)
model.fit(all_data)

In [None]:
def recommend_songs(model, songs, user_id, matrix, n=10):
    
    user_idx = songs[songs.user_id == user_id].user_idx.values[0]
    # songs_ids, scores = implict_model.recommend(user_idx, coo.tocsr()[n], N=n)

    recommendations = model.recommend(user_idx, matrix, N=n)
    recommended_song_ids = [songs.loc[songs['song_idx'] == idx, 'song_id'].iat[0] for idx, _ in recommendations]
    return [music_data.song_name(song_id) for song_id in recommended_song_ids]

userId = 'b7815dbb206eb2831ce0fe040d0aa537e2e800f7'
print("Songs recommended for a user:", recommend_songs(model, songs, userId, all_data, n=10))

In [None]:
songs

In [None]:
from implicit.evaluation import precision_at_k
# Evaluate using p@k

# implicit.evaluation.precision_at_k(model, train_user_items, test_user_items, int K=10, show_progress=True, int num_threads=1)
p_at_k = precision_at_k(model, train_matrix, test_matrix, K=10, show_progress=True, num_threads=1)
# average_precision = precision_at_k(model, test_matrix)
# print(f"Average Precision at k: {average_precision:.2f}")

In [None]:
p_at_k