In [65]:
import pandas as pd
import numpy as np
import json
import pickle
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from scipy.sparse import save_npz

In [2]:
df = pd.read_csv('/Users/vlad/Downloads/spotify_dataset.csv', skiprows=1,
                         names=['user_id', 'artistname', 'trackname', 'playlistname'],
                         on_bad_lines='skip')

In [3]:
df['artistname'].fillna('', inplace=True)
df['trackname'].fillna('', inplace=True)

df['song'] = df['artistname'] + '__' + df['trackname']

In [None]:
# songs = df=['song'].value_counts()

In [4]:
with open('songs.json') as f:
    top_songs = json.load(f)

### most popular songs

In [6]:
most_popular = set(top_songs[:9999])

# df = df[df['song'].isin(most_popular)]

In [9]:
train, test = train_test_split(df, test_size=0.15)

In [10]:
train = train[df['song'].isin(most_popular)]

  train = train[df['song'].isin(most_popular)]


### Test dataset for quality evaluation

In [17]:
test = test[test['user_id'].isin(set(train['user_id'].to_list()))]

result = test.groupby('user_id')['song'].unique().reset_index()

In [91]:
with open('test_index.json', 'w') as f:
    json.dump(test.index.to_list(), f)

In [23]:
result.head()

Unnamed: 0,user_id,song
0,00055176fea33f6e027cd3302289378b,"[5 Seconds Of Summer__Everything I Didn't Say,..."
1,0007f3dd09c91198371454c608d47f22,"[The Neighbourhood__Afraid, Daughter__Youth, S..."
2,000b0f32b5739f052b9d40fcc5c41079,"[Samuel Ljungblahd__Want To, Need To, Got to (..."
3,000c11a16c89aa4b14b328080f5954ee,"[Cat Stevens__Wild World, Jake Bugg__All Your ..."
4,00123e0f544dee3ab006aa7f1e5725a7,"[Barefoot Jerry__I'm Proud to be a Redneck, To..."


In [92]:
result.shape

(14560, 3)

### encode user_id

In [25]:
train['dummy'] = 1

In [26]:
user_item_matrix = pd.pivot_table(train, 
                                  index='user_id', columns='song', 
                                  values='dummy',
                                  aggfunc='count', 
                                  fill_value=0)

In [27]:
user_item_matrix = user_item_matrix.astype(float)

In [28]:
sparse_matrix = csr_matrix(user_item_matrix).tocsr()

In [69]:
sparse_matrix

<14910x9999 sparse matrix of type '<class 'numpy.float64'>'
	with 1943751 stored elements in Compressed Sparse Row format>

In [29]:
als = AlternatingLeastSquares(factors=100, 
                              regularization=0.001,
                              iterations=15, 
                              calculate_training_loss=True, 
                              num_threads=4)

In [30]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

In [31]:
userid_to_id = {user_id: idx for idx, user_id in enumerate(userids)}
id_to_userid = {val: key for key, val in userid_to_id.items()}

item_to_id = {item_id: idx for idx, item_id in enumerate(itemids)}
id_to_item = {val: key for key, val in item_to_id.items()}

In [97]:
for name, dict_ in zip(('userid_to_id', 'id_to_userid', 'item_to_id', 'id_to_item'), 
                       (userid_to_id, id_to_userid, item_to_id, id_to_item)):
    with open(f'app/data/{name}.json', 'w') as f:
        json.dump(dict_, f)

In [33]:
als.fit(sparse_matrix.T, show_progress=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [47]:
def get_recommendations(model, user_id, user_items_matrix=sparse_matrix, n=5, *args, **kwargs):
    recs = model.recommend(userid=userid_to_id[user_id], 
                         user_items=user_items_matrix,   # на вход user-item matrix   
                         N=n, # кол-во рекомендаций    
                         filter_already_liked_items=False,   
                         filter_items=None,   
                         recalculate_user=True, 
                         **kwargs)
    
    return [id_to_item[rec[0]] for rec in recs]

In [48]:
get_recommendations(als, user_id='00055176fea33f6e027cd3302289378b')

['Ed Sheeran__Thinking Out Loud',
 "Ed Sheeran__Don't",
 'Mark Ronson__Uptown Funk',
 'Ed Sheeran__Sing',
 'Meghan Trainor__All About That Bass']

In [52]:
from typing import List


def precision_at_k(recommended_list: List[str], true_list: List[str], k=5) -> float:
    true_list = np.array(true_list)
    recommended_list = np.array(recommended_list)
    recommended_list = recommended_list[:k]
    
    flags = np.isin(true_list, recommended_list)
    precision = flags.sum() / len(recommended_list)
    
    return precision

In [54]:
result['recs'] = result['user_id'].apply(lambda x: get_recommendations(als, x))

In [58]:
result.rename(columns={'song': 'actual'}, inplace=True)

In [59]:
result.head()

Unnamed: 0,user_id,actual,recs
0,00055176fea33f6e027cd3302289378b,"[5 Seconds Of Summer__Everything I Didn't Say,...","[Ed Sheeran__Thinking Out Loud, Ed Sheeran__Do..."
1,0007f3dd09c91198371454c608d47f22,"[The Neighbourhood__Afraid, Daughter__Youth, S...","[Ed Sheeran__I See Fire, Ed Sheeran__Thinking ..."
2,000b0f32b5739f052b9d40fcc5c41079,"[Samuel Ljungblahd__Want To, Need To, Got to (...","[First Aid Kit__Emmylou, First Aid Kit__The Li..."
3,000c11a16c89aa4b14b328080f5954ee,"[Cat Stevens__Wild World, Jake Bugg__All Your ...","[Mark Ronson__Uptown Funk, Haim__The Wire, Hai..."
4,00123e0f544dee3ab006aa7f1e5725a7,"[Barefoot Jerry__I'm Proud to be a Redneck, To...","[Led Zeppelin__Ramble On, Imagine Dragons__Rad..."


In [63]:
result.apply(lambda row: precision_at_k(row['recs'], row['actual']), axis=1).mean()

0.09589285714285714

In [68]:
save_npz('app/utils/sparse_matrix.npz', sparse_matrix)

### Угадываем 0.5 из 5 рекомендованных песен, можно улучшать рекомендации:))

In [64]:
with open('als_model.pkl', 'wb') as f:
    pickle.dump(als, f)