In [None]:
# -nc (--no-clobber): skip the download when the archive is already present,
# so re-running this cell does not fetch a second copy of the file.
! wget -nc http://mtg.upf.edu/static/datasets/last.fm/lastfm-dataset-360K.tar.gz

In [None]:
# Unpack the downloaded archive into the current directory.
!tar --extract --file lastfm-dataset-360K.tar.gz

In [1]:
import numpy as np
from scipy import sparse

In [2]:
import pandas as pd
# The plays file is tab-separated (the default delimiter of `read_table`)
# and has no header row, so the column names are supplied here.
df_plays = pd.read_table(
    'lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv',
    names=[
        "user-mboxsha1",
        "musicbrainz-artist-id",
        "artist-name",
        "plays",
    ],
    header=None,
)
df_plays

Unnamed: 0,user-mboxsha1,musicbrainz-artist-id,artist-name,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706
...,...,...,...,...
17535650,"sep 20, 2008",7ffd711a-b34d-4739-8aab-25e045c246da,turbostaat,12
17535651,"sep 20, 2008",9201190d-409f-426b-9339-9bd7492443e2,cuba missouri,11
17535652,"sep 20, 2008",e7cf7ff9-ed2f-4315-aca8-bcbd3b2bfa71,little man tate,11
17535653,"sep 20, 2008",f6f2326f-6b25-4170-b89d-e235b25508e8,sigur rós,10


In [3]:
# The profile file is tab-separated (the default delimiter of `read_table`)
# and has no header row, so the column names are supplied here.
df_profiles = pd.read_table(
    'lastfm-dataset-360K/usersha1-profile.tsv',
    names=[
        "user-mboxsha1",
        "gender",
        "age",
        "country",
        "signup",
    ],
    header=None,
)
df_profiles

Unnamed: 0,user-mboxsha1,gender,age,country,signup
0,00000c289a1829a808ac09c00daf10bc3c4e223b,f,22.0,Germany,"Feb 1, 2007"
1,00001411dc427966b17297bf4d69e7e193135d89,f,,Canada,"Dec 4, 2007"
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,,,Germany,"Sep 1, 2006"
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,m,19.0,Mexico,"Apr 28, 2008"
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,m,28.0,United States,"Jan 27, 2006"
...,...,...,...,...,...
359342,fffe7823f67b433b45f22056467db921c1d3d7d0,m,25.0,Germany,"Jun 24, 2006"
359343,fffe8637bd8234309e871409c7ebef99a720afc1,m,25.0,Brazil,"Sep 9, 2007"
359344,fffe8c7f952d9b960a56ed4dcb40a415d924b224,m,20.0,United States,"Aug 8, 2007"
359345,ffff9af9ae04d263dae91cb838b1f3a6725f5ffb,m,20.0,Russian Federation,"Dec 3, 2005"


In [4]:
# Keep only the play records of users whose profile country is listed in
# `countries`, attaching the profile columns to every play row.
countries = ["United Kingdom", "United States"]
df_countries = df_plays.merge(
    df_profiles.loc[df_profiles["country"].isin(countries)],
    on="user-mboxsha1",  # the only column the two frames have in common
    how="inner",
)
df_countries.shape

(4837010, 8)

In [5]:
# Total play count per artist, sorted from most to least played.
#
# The columns to aggregate are named explicitly instead of calling
# `.agg(sum)` on the whole frame. The original relied on pandas silently
# dropping the string columns (user hash, artist name, gender, country,
# signup); since pandas 2.0 `numeric_only` defaults to False, so those
# columns are no longer dropped and the result stops having the three
# columns recorded below.
#
# `age` is summed only so the frame keeps the same columns as the recorded
# output; a sum of listener ages has no meaning and no later cell shown
# here reads it.
total_plays = (df_countries
               .groupby('musicbrainz-artist-id', as_index=False)
               .agg({"plays": "sum", "age": "sum"})
               .sort_values("plays", ascending=False)
               .rename(columns={"plays": "total_plays"})
               .reset_index(drop=True))
total_plays.shape

(100070, 3)

In [6]:
THRESHOLD = 7000

# Running total of plays down the ranking, and the percentage of all plays
# accumulated up to and including each artist.
total_plays["cumulative_sum"] = total_plays["total_plays"].cumsum()
total_plays["cumulative_percentage"] = (
    total_plays["cumulative_sum"]
    .mul(100)
    .div(total_plays["total_plays"].sum())
)

# Row at position THRESHOLD. Note that the `index < THRESHOLD` filter in the
# next cell keeps positions 0..THRESHOLD-1, so this is the first artist that
# is left out.
total_plays.iloc[THRESHOLD]

musicbrainz-artist-id    1d940f58-bff7-451c-b5cb-2406cca45eb3
total_plays                                             14738
age                                                      1145
cumulative_sum                                     1027489130
cumulative_percentage                                 90.2369
Name: 7000, dtype: object

In [7]:
# Artists whose rank in `total_plays` is below THRESHOLD.
popular_artists = total_plays.query("index < @THRESHOLD")

# Play rows restricted to those artists, with at most one row per
# (user, artist) pair.
df_popular = pd.merge(df_countries, popular_artists, on="musicbrainz-artist-id")
df_popular = df_popular.reset_index(drop=True)
df_popular = df_popular.drop_duplicates(
    subset=['user-mboxsha1', 'musicbrainz-artist-id']
)
df_popular.shape

(4026262, 12)

In [8]:
# Give every distinct user a consecutive integer id (0, 1, 2, ...) in order
# of first appearance.
users = (
    pd.Series(df_popular["user-mboxsha1"].unique(), name="user-mboxsha1")
    .rename_axis("user_idx")
    .reset_index()
)

# Same for every distinct artist.
items = (
    pd.Series(df_popular["musicbrainz-artist-id"].unique(), name="musicbrainz-artist-id")
    .rename_axis("item_idx")
    .reset_index()
)

# Attach both integer ids to every play row; each join key is the single
# column the two frames share.
df_combined = (
    df_popular
    .merge(items, on="musicbrainz-artist-id")
    .merge(users, on="user-mboxsha1")
)
df_combined.shape

(4026262, 14)

In [None]:
# Binary item x user matrix: entry (item_idx, user_idx) is 1 when that row
# pair occurs in `df_combined`.
#
# The matrix is built in one vectorised call from the two index columns
# rather than by assigning one element per `iterrows()` step, which runs a
# Python-level loop over every row of the frame.
item_rows = df_combined["item_idx"].values
user_cols = df_combined["user_idx"].values
ratings = sparse.coo_matrix(
    (np.ones(len(df_combined)), (item_rows, user_cols)),
    shape=(items.shape[0], users.shape[0]),
).tocsr()

# Converting to CSR adds up repeated (row, col) entries; reset every stored
# value to 1 so a repeated pair behaves like the original assignment.
ratings.data[:] = 1

In [14]:
# Switch to compressed-sparse-row storage before fitting the neighbour model.
ratings = sparse.csr_matrix(ratings)

In [10]:
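# Optional cache: uncomment the first line to save `ratings` to disk, or the
# second line to load a previously saved copy.
# sparse.save_npz("user_item_ratings.npz", ratings)
# ratings = sparse.load_npz("user_item_ratings.npz")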

In [65]:
from sklearn.neighbors import NearestNeighbors

# Exhaustive (brute-force) neighbour search over the rows of `ratings`,
# comparing rows by cosine distance.
knn = NearestNeighbors(
    n_neighbors=10,
    algorithm='brute',
    metric='cosine',
)
knn.fit(ratings)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                 radius=1.0)

In [66]:
# Lookup table between artist id, artist name and matrix row index,
# one row per distinct combination of the three.
item_mapping = (
    df_combined
    .loc[:, ["musicbrainz-artist-id", "artist-name", "item_idx"]]
    .drop_duplicates()
)

In [74]:
def recommend_items(item_name, model, item_mapping, ratings, n_recommendations=5):
    """Print the items that ``model`` ranks closest to ``item_name``.

    Parameters
    ----------
    item_name : str
        Value to look up in the ``artist-name`` column of ``item_mapping``.
    model : object
        Fitted neighbour model exposing ``kneighbors(X, n_neighbors=...)``
        that returns ``(distances, indices)``. It is expected to have been
        fitted on ``ratings``, so that the returned indices are row numbers
        of ``ratings`` / values of ``item_idx``.
    item_mapping : pandas.DataFrame
        Must contain the columns ``artist-name`` and ``item_idx``.
    ratings : scipy.sparse matrix
        One row per item; the row at the item's ``item_idx`` is the query.
    n_recommendations : int, optional
        Maximum number of neighbours to print. The default of 5 matches the
        previous hard-coded behaviour.

    Raises
    ------
    ValueError
        If ``item_name`` does not occur in ``item_mapping``.
    """
    matches = item_mapping.loc[item_mapping["artist-name"] == item_name, "item_idx"]
    if matches.empty:
        raise ValueError("Unknown artist: {0!r}".format(item_name))

    # The same name can sit on several mapping rows; use the first match so
    # the query is always exactly one row of `ratings`.
    query_idx = int(matches.iloc[0])

    # Indexing with a list keeps the result a single-row 2-D matrix.
    X_pred = ratings[[query_idx], :]

    # Ask for one extra neighbour because the query item is normally among
    # its own neighbours, but never for more rows than `ratings` has.
    n_neighbors = min(n_recommendations + 1, ratings.shape[0])
    _, indices = model.kneighbors(X_pred, n_neighbors=n_neighbors)

    # Drop the query by index instead of assuming it comes first: with tied
    # distances another item can be listed ahead of it.
    neighbours = [i for i in indices.flatten() if i != query_idx][:n_recommendations]

    # item_idx -> first artist name recorded for that index.
    names = item_mapping.drop_duplicates("item_idx").set_index("item_idx")["artist-name"]

    print("Recommendations for {0}".format(item_name))
    for rank, neighbour in enumerate(neighbours, start=1):
        print('{0}: {1}'.format(rank, names[neighbour]))

# Example query against the fitted neighbour model.
recommend_items(
    item_name="rolling stones",
    model=knn,
    item_mapping=item_mapping,
    ratings=ratings,
)

Recommendations for rolling stones
1: the beatles
2: bob dylan
3: the who
4: led zeppelin
5: david bowie
