# MySpotify

In [1]:
import polars as pl
import numpy as np
import scipy
import implicit

from pathlib import Path

One very common problem to solve is when you have a number of users and a number of products, and you want to recommend which products are most likely to be useful for which users. There are many variations of this: for example, recommending movies (such as on Netflix), figuring out what to highlight for a user on a home page, deciding what stories to show in a social media feed, and so forth. There is a general solution to this problem, called collaborative filtering, which works like this: look at what products the current user has used or liked, find other users that have used or liked similar products, and then recommend other products that those users have used or liked.

## 1. Non-personalized approach.

### 1.1 Top-250 tracks

In [2]:
triplet_columns = ['user_id', 'song_id', 'play_count']
triplet_df = pl.read_csv('data/train_triplets.txt', separator='\t', new_columns=triplet_columns, use_pyarrow=True)
triplet_df

user_id,song_id,play_count
str,str,i64
"""b80344d063b5cc…","""SOAPDEY12A81C2…",1
"""b80344d063b5cc…","""SOBBMDR12A8C13…",2
"""b80344d063b5cc…","""SOBFNSP12AF72A…",1
"""b80344d063b5cc…","""SOBFOVM12A58A7…",1
"""b80344d063b5cc…","""SOBNZDC12A6D4F…",1
…,…,…
"""b7815dbb206eb2…","""SOUHHHH12AF729…",2
"""b7815dbb206eb2…","""SOUJVIT12A8C14…",1
"""b7815dbb206eb2…","""SOUSMXX12AB018…",1
"""b7815dbb206eb2…","""SOWYSKH12AF72A…",3


In [3]:
track_columns = ['track_id', 'song_id', 'artist', 'title']
unique_tracks_df = pl.read_csv('data/p02_unique_tracks.txt', separator=b'<SEP>', new_columns=track_columns, use_pyarrow=True)
unique_tracks_df

TypeError: ord() expected a character, but string of length 5 found

In [None]:
unique_tracks_df

In [None]:
unique_tracks_df.drop_duplicates(['song_id'], inplace=True)

In [None]:
if not Path('data/merged.csv').exists():
    merged_df = triplet_df.join(unique_tracks_df, on='song_id')
    merged_df['song'] = merged_df['title'] + ' - ' + merged_df['artist']
    merged_df.to_csv('data/merged.csv', index=False)
else:
    merged_df = pl.read_csv('data/merged.csv')
merged_df

In [None]:
song_play_counts = triplet_df.groupby('song_id')['play_count'].sum()
sorted_songs = song_play_counts.sort_values(ascending=False)

In [4]:
top_250_songs = sorted_songs.head(250)

# Merge the top 250 songs with the track metadata
top_250_tracks = pd.merge(top_250_songs, unique_tracks_df, on='song_id')

# Reset the index and rename columns
top_250_tracks.reset_index(inplace=True)
top_250_tracks.rename(columns={'index': 'index number'}, inplace=True)
top_250_tracks = top_250_tracks[['index number', 'artist', 'title', 'play_count']]

top_250_tracks.head(5)

NameError: name 'sorted_songs' is not defined

apply .head(5) to the resulting dataframe of the top-250 tracks, the result should be exactly like this:
artist title play_count
- 0 Dwight Yoakam You're The One 726885
- 1 Björk Undo 648239
- 2 Kings Of Leon Revelry 527893
- 3 Harmonia Sehr kosmisch 425463
- 4 Barry Tuckwell/Academy of St Martin-in-the-Fie... Horn Concerto No. 4 in E flat K495: II. Romanc... 389880

In [None]:
top_250_tracks.tail(5)

Apply .tail(5) to the resulting dataframe of the top-250 tracks, the result
should be exactly like this:
artist title play_count
- 245 Triple Six Mafia Now I'm High_ Really High 35253
- 246 The Red Jumpsuit Apparatus Face Down (Album Version) 35245
- 247 Linkin Park New Divide (Album Version) 35191
- 248 Selena Gomez & The Scene Naturally 35074
- 249 Creedence Clearwater Revival Have You Ever Seen The Rain 34831



### 2. Top-100 Tracks by Genre: Non-personalized approach.

In [None]:
genre_column_names = ['track_id', 'majority_genre', 'minority_genre']
tagtraum_genre_df = pd.read_csv('data/p02_msd_tagtraum_cd2.cls', sep='\t', comment='#', names=genre_column_names)
tagtraum_genre_df.drop(columns=['minority_genre'], axis=1, inplace=True)
tagtraum_genre_df

In [None]:
triplet_df

In [None]:
merged_df_genre = pd.merge(pd.merge(tagtraum_genre_df, unique_tracks_df, on='track_id'), triplet_df, on='song_id')

In [None]:
merged_df_genre

In [None]:
def get_top_and_bottom_tracks(merged_df_genre, selected_genre):
    # Filter by the specified genre
    genre_subset = merged_df_genre[merged_df_genre['majority_genre'] == selected_genre]

    # Aggregate play counts for each track in the selected genre
    track_play_counts = genre_subset.groupby(['artist', 'title'])['play_count'].sum()

    # Sort tracks by play count in descending order
    sorted_tracks = track_play_counts.sort_values(ascending=False).head(100)

    # Get the top 5 and bottom 5 tracks
    top_tracks = sorted_tracks.head(5)
    bottom_tracks = sorted_tracks.tail(5)

    return top_tracks, bottom_tracks

#### Rock

Apply .head(5) to the resulting dataframe of the top-100 tracks
- for the genre, the result should be exactly like this:
- artist title play_count
- 0 Björk Undo 648239
- 1 Kings Of Leon Revelry 527893
- 2 Harmonia Sehr kosmisch 425463
- 3 OneRepublic Secrets 292642
- 4 Tub Ring Invalid 268353

Apply .tail(5) to the resulting dataframe of the top-100 tracks
for the genre, the result should be exactly like this:
- artist title play_count
- 95 Metric Gold Guns Girls 28148
- 96 Pearl Jam Encore Break 27579
- 97 Daughtry No Surprise 27187
- 98 Eric Clapton Tears In Heaven 26999
- 99 Nick Lowe All Men Are Liars 26683

In [None]:
selected_genre = 'Rock'

top_tracks, bottom_tracks = get_top_and_bottom_tracks(merged_df_genre, selected_genre)

# Display the results
print(f"Top 5 tracks for the genre: {selected_genre}")
print(top_tracks)

print(f"\nBottom 5 tracks for the genre: {selected_genre}")
print(bottom_tracks)

### Rap

**Apply .head(5) to the resulting dataframe of the top-100 tracks
for the genre, the result should be exactly like this:**
artist title play_count
- 0 Alliance Ethnik Représente 241669
- 1 Beastie Boys The Maestro 72381
- 2 Eminem Without Me 63918
- 3 Black Eyed Peas Imma Be 62438
- 4 Kid Cudi Up Up & Away 59810

**Apply .tail(5) to the resulting dataframe of the top-100 tracks
for the genre, the result should be exactly like this:**
artist title play_count
- 95 Shwayze Buzzin' 7384
- 96 Orishas El Kilo 7324
- 97 Snoop Dogg Sexual Eruption 7171
- 98 Bone Thugs-N-Harmony Tha Crossroads 7124
- 99 Orishas Habana 6998

In [None]:
selected_genre = 'Rap'

top_tracks, bottom_tracks = get_top_and_bottom_tracks(merged_df_genre, selected_genre)

# Display the results
print(f"Top 5 tracks for the genre: {selected_genre}")
print(top_tracks)

print(f"\nBottom 5 tracks for the genre: {selected_genre}")
print(bottom_tracks)

### Electronic

**Apply .head(5) to the resulting dataframe of the top-100 tracks
for the genre, the result should be exactly like this:**
artist title play_count
- 0 Southside Spinners Luvstruck 84225
- 1 The Black Keys Tighten Up 81179
- 2 Deadmau5 Ghosts 'n' Stuff (Original Instrumental Mix) 63951
- 3 Daft Punk Harder Better Faster Stronger 63170
- 4 Clara Hill Clara meets Slope - Hard To Say 58887

**Apply .tail(5) to the resulting dataframe of the top-100 tracks
for the genre, the result should be exactly like this:**
artist title play_count
- 95 Nicolette No Government 9541
- 96 Two Door Cinema Club Eat That Up_ It's Good For You 9524
- 97 Moby Why Does My Heart Feel So Bad? (2006 Digital R... 9491
- 98 Death In Vegas Girls 9490
- 99 Johan Gielen Flash 9431

In [None]:
selected_genre = 'Electronic'

top_tracks, bottom_tracks = get_top_and_bottom_tracks(merged_df_genre, selected_genre)

# Display the results
print(f"Top 5 tracks for the genre: {selected_genre}")
print(top_tracks)

print(f"\nBottom 5 tracks for the genre: {selected_genre}")
print(bottom_tracks)

## Collections

> Task: Collections: 50 songs about love, 50 songs about war, 50 songs about happiness, 50songs about loneliness, 50 songs about money. Content-based approach.

> It should return on a given keyword (love, war, happiness) a dataframe (50tracks) with the following fields:
>  index number, artist name, track title, play count.  
> The table should be sorted by the play count descendingly. Try different approaches to these recommendations:  
> • baseline - when you look for the keyword and the number of its occurrences in a song, filter using some threshold and then sorting it by the play count,  
> • word2vec - when you look not only for the keyword but for several similar tokens as well using word2vec,  
> • classification task -you may label your data and try classification algorithms that will predict for the other part of the dataset if a track belongs to a specific class.  

In [None]:
import pandas as pd
# from gensim.models import Word2Vec
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.naive_bayes import MultinomialNB

def load_musixmatch_dataset(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    
    # Initialize empty lists to hold the top words and data
    top_words = []
    data = []
    words_dict = {}
    
    # Parse each line
    for line in lines:
        line = line.strip()
        if line.startswith('#'):
            # This is a comment line, ignore it
            continue
        elif line.startswith('%'):
            # This is the list of top words
            top_words = line[1:].split(',')
            for i, word in enumerate(top_words):
                words_dict[word] = i+1
        else:
            # This is a normal data line
            data.append(line.split(','))
    
    # Convert the data to a DataFrame
    data = pd.DataFrame(data)
    
    return words_dict, data
idx2words, data = load_musixmatch_dataset('data/mxm_dataset_train.txt')

In [None]:
words2idx = {v: k for k, v in idx2words.items()}

In [None]:
if not Path('data/mxm_dataset_train.csv').exists():
    top_words, musixmatch = load_musixmatch_dataset('data/mxm_dataset_train.txt')
    musixmatch.columns = ['track_id', 'mxm_track_id', ] + list(range(1, 729))
    musixmatch.to_csv('data/mxm_dataset_train.csv', index=False)
else:
    musixmatch = pl.read_csv('data/mxm_dataset_train.csv')

In [None]:
musixmatch

In [None]:
import pandas as pd

# Sample data (replace this with your actual data)
data = [
"TRAAAAV128F421A322,4623710,1:6,2:4,3:2,4:2,5:5,6:3,7:1,8:1,11:1,12:2,13:3,14:1,15:1,18:2,19:2,20:2,21:2,23:4,25:1,26:2,28:1,30:1,36:2,42:1,45:1,54:2,56:1,57:1,68:1,99:1,192:2,249:1,264:1,356:1,389:1,561:1,639:1,656:1,687:1,761:1,773:1,804:1,869:2,914:1,1035:1,1156:1,1221:1,1287:1,1364:1,1407:1,1533:2,1857:1,2096:1,2117:1,2482:2,2548:1,2705:1,2723:1,2868:2,2992:2,3455:1,3717:1,3851:1,4322:1,4382:1,4613:1,4713:1,4906:1",
"TRAAABD128F429CF47,6477168,1:10,3:17,4:8,5:2,6:2,7:1,8:3,9:2,10:3,11:4,12:3,14:7,15:5,16:5,18:6,23:4,24:1,26:6,28:2,29:5,31:3,33:3,35:2,39:3,40:1,43:5,47:7,52:2,57:3,58:2,61:2,62:2,68:2,71:4,74:2,76:4,81:5,84:2,86:3,87:2,88:2,89:2,92:2,101:1,107:1,111:2,113:1,118:3,119:2,130:3,131:3,165:1,168:1,169:2,178:4,180:2,188:2,196:7,200:1,219:2,229:2,256:1,279:2,349:4,384:1,393:2,424:2,472:3,589:1,843:2,1038:2,1351:1,1542:2,2437:2",
"TRAAAED128E0783FAB,2516445,1:28,2:15,3:2,4:12,5:22,6:2,7:2,8:4,9:2,10:1,11:20,12:3,13:1,15:7,16:2,17:1,18:2,20:8,21:13,23:6,25:4,26:3,27:11,28:9,30:4,31:1,34:1,35:3,36:1,39:6,41:4,44:1,48:4,50:2,51:1,56:2,57:7,59:1,61:2,64:10,65:9,67:5,71:3,79:1,80:6,81:3,84:1,88:3,89:1,94:7,99:2,103:4,105:6,111:3,113:5,116:1,119:3,120:1,122:6,127:1,129:3,131:8,134:1,135:6,136:8,138:1,152:1,157:1,159:2,166:1,178:1,182:1,187:6,197:1,200:1,204:3,205:1,211:1,221:1,223:1,231:1,239:1,249:1,251:4,257:1,264:4,279:1,284:1,290:1,295:6,310:7,320:3,345:2,366:1,367:3,410:3,425:1,437:1,452:4,459:1,494:1,518:1,552:1,562:1,628:1,634:1,656:1,738:3,770:1,876:1,1090:1,1091:1,1271:1,1325:1,1412:1,1485:1,1535:3,2143:1,2467:1,3344:1",
"TRAAAEF128F4273421,3759847,1:5,2:4,3:3,4:2,5:1,6:11,9:4,12:9,13:3,15:2,16:1,18:1,23:3,29:1,30:1,32:1,35:1,40:1,53:3,56:1,57:3,59:1,61:1,63:1,68:3,80:1,91:1,93:1,98:1,101:1,105:1,115:1,123:1,129:17,146:9,155:1,171:1,178:13,204:1,225:1,228:1,251:1,264:1,278:1,279:1,287:1,290:1,323:1,330:1,340:3,440:1,618:1,651:1,927:1,2221:1,2261:1,2581:1,3452:1,4470:1",
"TRAAAEW128F42930C0,3783760,1:4,4:5,5:7,6:2,7:4,9:1,10:1,11:9,15:1,17:2,20:3,22:1,32:5,36:1,45:1,67:1,68:1,115:3,122:1,123:1,178:1,185:1,205:11,216:1,258:1,264:1,270:1,283:1,299:1,328:1,353:7,424:1,491:1,583:1,666:1,735:1,778:1,821:1,854:1,883:1,1169:1,1247:1,1310:1,1344:1,1350:1,1361:1,1497:1,1510:1,1634:1,1656:1,1801:3,1806:1,2931:1,2942:1,3439:1,3851:1,3876:1,4180:1,4240:1,4518:1,4554:1,4639:1,4758:1",
"TRAAAFD128F92F423A,6640025,1:16,2:4,4:1,5:3,6:5,7:5,8:3,9:4,10:3,11:2,12:6,13:3,16:1,17:2,18:2,20:4,22:2,23:1,24:1,26:4,30:1,34:1,37:1,43:2,48:1,57:1,58:1,59:1,61:1,65:2,67:1,68:1,74:4,83:1,84:1,93:1,95:1,97:2,99:1,101:1,108:1,119:2,126:2,137:2,139:2,141:1,159:1,163:1,184:1,186:1,193:1,201:2,264:3,270:1,279:1,281:3,299:1,414:1,449:2,460:4,468:2,537:1,549:2,591:1,608:1,647:2,672:1,689:1,855:1,1021:1,1146:1,1361:1,1523:1,1576:1,1582:2,1646:1,1655:2,1931:1,2602:1,2627:2,2676:1,3210:2",
"TRAAAGF12903CEC202,5493388,90:1,151:1,164:1,181:2,243:1,453:1,710:1,716:1,897:1,978:1,1349:1,1733:2,1736:1,2145:1,2806:1,4294:1,4330:1,4502:1,4684:1",
"TRAAAHJ128F931194C,5133845,1:4,2:11,3:2,4:7,5:3,6:5,8:1,9:3,10:6,11:6,13:9,14:3,16:3,17:1,19:1,20:1,23:3,25:3,27:1,30:3,34:1,35:1,37:1,39:1,40:3,50:3,58:3,66:1,71:1,72:1,74:3,78:1,81:4,82:1,83:3,93:3,111:1,122:1,127:1,139:1,140:2,159:2,172:1,184:1,192:1,194:4,267:3,283:1,287:3,361:1,384:4,404:1,451:2,464:1,468:1,510:3,538:1,544:4,571:1,647:1,654:1,684:1,728:1,733:1,825:1,870:1,891:1,946:1,1117:1,1249:4,1342:1,1409:1,1528:1,2172:1,2444:1,2495:1,2854:1,3158:1,3176:1,4412:1",
"TRAAAHZ128E0799171,1619153,1:39,2:30,3:10,4:10,5:28,6:21,7:1,8:20,9:11,10:12,11:9,12:10,13:9,14:1,15:5,16:11,17:4,18:1,19:1,20:17,22:7,23:4,24:1,26:2,27:2,28:5,29:1,30:6,31:5,32:4,33:2,35:1,36:1,37:8,39:2,40:3,41:2,44:1,45:11,48:16,49:1,50:2,51:2,52:6,54:4,55:3,57:2,58:1,59:1,61:1,62:4,65:1,66:2,69:3,70:1,72:2,76:7,78:3,81:1,82:1,85:2,86:1,88:2,93:2,94:1,95:3,98:1,99:1,100:1,106:3,107:1,108:1,109:1,110:1,111:11,116:1,119:1,133:1,134:2,135:3,136:4,137:1,153:1,155:1,166:1,185:1,190:1,192:1,200:1,201:1,203:2,205:3,206:1,207:1,210:1,216:2,219:1,223:3,228:1,239:1,242:1,244:1,261:1,262:1,277:1,278:1,285:1,289:3,291:3,302:1,304:1,323:1,325:1,327:2,332:6,338:1,339:1,344:1,349:1,355:1,357:3,358:1,363:1,368:1,371:1,378:3,389:1,396:6,410:1,413:1,427:1,431:2,439:1,446:1,448:1,452:1,455:3,456:1,470:1,494:1,500:1,503:1,520:1,525:1,537:1,539:1,546:1,548:1,555:4,578:1,597:1,608:1,615:2,618:3,631:1,646:1,658:2,659:1,663:1,693:1,707:3,712:1,739:1,752:3,811:1,836:2,867:1,868:3,886:2,901:1,918:1,924:1,948:1,1021:1,1037:1,1057:1,1064:2,1067:1,1097:1,1171:1,1182:1,1192:3,1195:2,1230:1,1260:3,1321:1,1330:1,1337:7,1354:2,1381:2,1400:1,1432:1,1508:1,1611:1,1640:1,1670:6,1683:1,1688:1,1745:1,1749:3,1813:1,1885:1,1915:4,1938:1,2016:1,2125:1,2149:1,2189:2,2246:1,2282:1,2337:2,2411:1,2468:1,2614:13,2648:1,2854:1,2917:1,2933:1,3195:1,3612:1,3656:1,3831:1,4036:1,4135:1",
"TRAAAJG128F9308A25,8525084,1:6,2:9,5:3,6:4,7:4,8:3,9:5,10:3,12:2,15:1,16:2,17:1,18:1,23:4,26:1,28:1,31:2,36:1,40:1,48:1,52:1,59:1,70:4,82:1,86:1,93:3,97:1,109:1,110:8,113:1,159:1,164:1,187:6,191:1,225:1,232:1,235:1,265:1,269:3,292:1,311:1,324:1,376:1,448:1,548:3,584:1,617:3,1161:3,1440:1,1496:1,1533:1,1869:1,2018:1,2207:1,2227:1,2537:1",

]

# Create DataFrame
columns = ['track_id', 'mxm_track_id', ] + list(range(1, 225))
songs_data = pd.DataFrame([d.split(',') for d in data], columns=columns)
songs_data

In [None]:
filtered_songs = []
for index, row in songs_data.iterrows():
    print(row[2:])

In [None]:
# Baseline Approach
def baseline_search(keyword, songs_data, threshold=3):
    filtered_songs = []
    for index, row in songs_data.iterrows():
        if row['word_occurrences'].get(keyword, 0) >= threshold:
            filtered_songs.append(row)
    filtered_songs = pd.DataFrame(filtered_songs)
    return filtered_songs.sort_values(by='play_count', ascending=False).head(50)

# Example usage:
filtered_songs_baseline = baseline_search('love', songs_data)
print(filtered_songs_baseline)


In [None]:
# Merge the data based on track_id
# merged_data = merged_df.join(musixmatch, on='track_id', how='inner')
# merged_data

In [None]:
def get_recommendations(data, keyword, method='baseline', threshold=5):
    if method == 'baseline':
        # Baseline approach: Count keyword occurrences in lyrics
        keyword_counts = data['word_counts'].apply(lambda x: x.count(keyword))
        filtered_data = data[keyword_counts >= threshold]
        recommendations = filtered_data[['artist', 'title', 'play_count']].sort_values('play_count', ascending=False).head(50)
        recommendations.reset_index(inplace=True)
        recommendations.index.name = 'index number'
        return recommendations
    
    # elif method == 'word2vec':
    #     # Word2Vec approach: Find similar tokens using Word2Vec
    #     lyrics = data['word_counts'].apply(lambda x: x.split(','))
    #     model = Word2Vec(lyrics, min_count=1)
    #     similar_words = [word for word, _ in model.wv.most_similar(keyword, topn=10)]
    #     filtered_data = data[data['word_counts'].apply(lambda x: any(word in x for word in similar_words))]
    #     recommendations = filtered_data[['artist', 'title', 'play_count']].sort_values('play_count', ascending=False).head(50)
    #     recommendations.reset_index(inplace=True)
    #     recommendations.index.name = 'index number'
    #     return recommendations
    
    # elif method == 'classification':
    #     # Classification approach: Label data and predict class
    #     vectorizer = CountVectorizer()
    #     X = vectorizer.fit_transform(data['word_counts'])
    #     y = data['word_counts'].apply(lambda x: keyword in x).astype(int)
    #     clf = MultinomialNB()
    #     clf.fit(X, y)
    #     predictions = clf.predict(X)
    #     filtered_data = data[predictions == 1]
    #     recommendations = filtered_data[['artist', 'title', 'play_count']].sort_values('play_count', ascending=False).head(50)
    #     recommendations.reset_index(inplace=True)
    #     recommendations.index.name = 'index number'
    #     return recommendations
    
    # else:
    #     raise ValueError(f"Invalid method: {method}")

In [None]:
try:
    keyword = 'love'
    recommendations = get_recommendations(musixmatch, keyword, method='baseline', threshold=5)
    print(f"Top 50 tracks for the keyword '{keyword}':")
    print(recommendations)
except ValueError as e:
    print(str(e))

In [None]:
songs = merged_df[['user_id', 'song_id', 'track_id', 'song', 'play_count']]
songs

In [None]:
songs['user_id'] = pd.factorize(songs['user_id'])[0]
songs['song_id'] = pd.factorize(songs['song_id'])[0]
songs

In [None]:
class MusicData:
    def __init__(self, data):
        self.data = data
        self.song_id_to_name = pd.Series(data.song.values, index=data.song_id).to_dict()

    def get_user_songs(self, user_id):
        user_data = self.data[self.data['user_id'] == user_id]
        user_songs = [self.song_id_to_name[song_id] for song_id in user_data['song_id'].unique()]
        return user_songs

    def get_song_users(self, song_id):
        song_data = self.data[self.data['song_id'] == song_id]
        song_users = song_data['user_id'].unique()
        return song_users

    def get_song_name(self, song_id):
        return self.song_id_to_name.get(song_id, "Song ID not found in data")

    def get_top_songs(self, n=10):
        top_songs = self.data['song_id'].value_counts()[:n].index.tolist()
        return top_songs

    def get_top_users(self, n=10):
        top_users = self.data['user_id'].value_counts()[:n].index.tolist()
        return top_users

music_data = MusicData(songs)

In [None]:
X = songs[['user_id', 'song_id', 'play_count']]
X

In [None]:
X.user_id.nunique(), X.song_id.nunique(), 

In [None]:
print("Songs listened by a user:", music_data.get_user_songs(0))

In [None]:
print("Song name for a song ID:", music_data.get_song_name(0)) 

In [None]:
X.set_index(["user_id", "song_id"], inplace=True)

In [None]:
X.sample(10000)

In [None]:
X.index.get_level_values(0)[:30]

In [None]:
X.index.get_level_values(1)

In [None]:
coo = scipy.sparse.coo_matrix( (X.play_count.astype(float), (X.index.get_level_values(0), X.index.get_level_values(1),),))

In [None]:
arr = coo.tocsr().toarray()
arr.shape

In [None]:
implict_model = implicit.als.AlternatingLeastSquares(
    factors=50, iterations=10, regularization=0.01
)

In [None]:
implict_model.fit(coo.tocsr())

In [None]:
user_id = 65
n = 100
songs_ids, scores = implict_model.recommend(user_id, coo.tocsr()[n], N=n)

In [None]:
songs_ids[:10]

In [None]:
scores[:10]

In [None]:
[music_data.get_song_name(song_id) for song_id in songs_ids[:20]]

In [None]:
music_data.get_user_songs(user_id)

In [None]:
songs.song[100:150]

In [None]:
itemids, scores = implict_model.similar_items(itemid=118)

In [None]:
[music_data.get_song_name(item_id) for item_id in itemids]