In [52]:
import sys
import pickle
import pandas as pd
sys.path.append('../utils')

from baseline import Baseline
from get_test import get_test
from baseline import Baseline
from metrics import precision_at_k
from scipy.sparse import load_npz, save_npz
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

In [2]:
# Hold-out interactions, collapsed to one row per user: `actual` holds the
# array of distinct songs recorded for that user in the test set.
test = pd.read_csv('../data/test.csv', sep=',')
result = (
    test
    .groupby('user_id')['song']
    .unique()
    .reset_index()
    .rename(columns={'song': 'actual'})
)

In [4]:
# Load the pre-built sparse training matrix and cast its values to float
# in a single expression.
user_item_matrix_ = load_npz('../data/train_sparse.npz').astype('float')

In [5]:
# Display the loaded matrix to check its shape, dtype and storage format.
user_item_matrix_

<14907x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 1943605 stored elements in COOrdinate format>

In [6]:
# Raw training interactions; pivoted below into the user-item matrix.
# NOTE(review): this reads from the home directory while every other input
# comes from '../data/' — consider keeping train.csv there so the notebook
# runs on other machines.
train = pd.read_csv('~/Downloads/train.csv')

In [8]:
# Constant helper column: counting it per (user_id, song) pair gives the
# number of interaction rows for that pair.
train['dummy'] = 1

# Dense users x songs table of interaction counts (0 where a pair never
# occurs), cast to float right away.
user_item_matrix = pd.pivot_table(
    train,
    index='user_id',
    columns='song',
    values='dummy',
    aggfunc='count',
    fill_value=0,
).astype('float')

In [9]:
# Compress the dense pivot table into CSR form; the csr_matrix constructor
# already yields CSR, so no further format conversion is required.
sparse_matrix = csr_matrix(user_item_matrix)

In [10]:
# Display the CSR matrix to check its shape, dtype and stored-element count.
sparse_matrix

<14907x10000 sparse matrix of type '<class 'numpy.longlong'>'
	with 1943605 stored elements in Compressed Sparse Row format>

In [11]:
# Row labels (user ids) and column labels (songs) of the pivot table, in
# matrix order.
userids, itemids = user_item_matrix.index.values, user_item_matrix.columns.values

In [12]:
# Label -> matrix position lookups and their inverses, for users (rows)
# and songs (columns).
userid_to_id = dict(zip(userids, range(len(userids))))
id_to_userid = {position: label for label, position in userid_to_id.items()}

item_to_id = dict(zip(itemids, range(len(itemids))))
id_to_item = {position: label for label, position in item_to_id.items()}

In [38]:
def get_recommendations(model, user_id, user_items_matrix=None, n=5, *args, **kwargs):
    """Return the top-``n`` recommended item labels for one user.

    Parameters
    ----------
    model
        Fitted recommender exposing
        ``recommend(userid=..., user_items=..., N=..., ...)``.
    user_id
        External user identifier; translated to a matrix row index through
        the notebook-level ``userid_to_id`` mapping.
    user_items_matrix
        Matrix passed to ``model.recommend`` as ``user_items``. When left as
        ``None`` the notebook-level ``sparse_matrix`` is looked up at call
        time, so re-running the cell that builds the matrix is picked up
        without re-defining this function.
    n
        Number of recommendations to request (passed as ``N``).
    *args
        Accepted for backward compatibility; not forwarded anywhere.
    **kwargs
        Extra keyword arguments forwarded to ``model.recommend``.

    Returns
    -------
    list
        Item labels: the first element of every entry returned by
        ``model.recommend``, mapped through ``id_to_item``.

    Raises
    ------
    KeyError
        If ``user_id`` is missing from ``userid_to_id`` or a recommended
        index is missing from ``id_to_item``.
    """
    # Resolve the default lazily: binding the global in the signature would
    # freeze whichever matrix object existed when this cell was executed.
    if user_items_matrix is None:
        user_items_matrix = sparse_matrix

    recs = model.recommend(userid=userid_to_id[user_id],
                           user_items=user_items_matrix,      # user-item matrix input
                           N=n,                               # number of recommendations
                           filter_already_liked_items=False,  # already-seen items are not filtered out
                           filter_items=None,
                           recalculate_user=True,
                           **kwargs)

    # Assumes every entry is indexable with the item index first, e.g.
    # (item_index, score) — TODO confirm against the installed implicit version.
    return [id_to_item[rec[0]] for rec in recs]

In [15]:
# ALS model with 100 latent factors; apart from `factors`, the settings are
# the same as for the 256-factor model built further down.
# NOTE(review): no random seed is set, so a rerun may not reproduce the
# recorded metric exactly — confirm whether the installed implicit version
# accepts `random_state`.
als_100 = AlternatingLeastSquares(factors=100, 
                              regularization=0.001,
                              iterations=15, 
                              calculate_training_loss=True, 
                              num_threads=4)

In [37]:
# Fit on the transpose: sparse_matrix is users x songs, so the model is
# given a songs x users matrix.
# NOTE(review): presumably this matches the item-user input expected by the
# installed implicit release — confirm before upgrading the library.
als_100.fit(sparse_matrix.T, show_progress=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [29]:
# Display the per-user ground-truth table.
result

Unnamed: 0,user_id,actual
0,00055176fea33f6e027cd3302289378b,"[Imagine Dragons__I Bet My Life, Meghan Traino..."
1,0007f3dd09c91198371454c608d47f22,"[Lana Del Rey__Black Beauty, Evanescence__My I..."
2,000b0f32b5739f052b9d40fcc5c41079,"[Veronica Maggio__Hela huset, First Aid Kit__S..."
3,000c11a16c89aa4b14b328080f5954ee,"[Bastille__Pompeii, Pharrell Williams__Know Wh..."
4,00123e0f544dee3ab006aa7f1e5725a7,[Creedence Clearwater Revival__Long As I Can S...
...,...,...
13614,ffe32d5412269f3041c58cbf0dde3306,"[Diana Ross__I'm Coming Out, Aretha Franklin__..."
13615,ffec270eae226caa14ddaef291d73fff,[Walk the Moon__Shut Up and Dance]
13616,fff60baf392613ed33f745b89a9b38f7,"[Shaggy__Boombastic, Interpol__Everything Is W..."
13617,fff616055993498d6127f3f467cf9f2b,"[Matt and Kim__Let's Go, CAZZETTE__Sleepless -..."


In [39]:
# Recommendations from the 100-factor model for every test user, using
# get_recommendations' default n, stored next to the ground truth.
result['als_100'] = result['user_id'].apply(lambda x: get_recommendations(als_100, x))

In [40]:
# Display ground truth and 100-factor recommendations side by side.
result

Unnamed: 0,user_id,actual,als_100
0,00055176fea33f6e027cd3302289378b,"[Imagine Dragons__I Bet My Life, Meghan Traino...","[Ed Sheeran__Thinking Out Loud, Mark Ronson__U..."
1,0007f3dd09c91198371454c608d47f22,"[Lana Del Rey__Black Beauty, Evanescence__My I...","[Ed Sheeran__The A Team, Ed Sheeran__I See Fir..."
2,000b0f32b5739f052b9d40fcc5c41079,"[Veronica Maggio__Hela huset, First Aid Kit__S...","[First Aid Kit__Emmylou, First Aid Kit__The Li..."
3,000c11a16c89aa4b14b328080f5954ee,"[Bastille__Pompeii, Pharrell Williams__Know Wh...","[Mark Ronson__Uptown Funk, Lorde__Royals, Disc..."
4,00123e0f544dee3ab006aa7f1e5725a7,[Creedence Clearwater Revival__Long As I Can S...,"[Led Zeppelin__Immigrant Song, Led Zeppelin__W..."
...,...,...,...
13614,ffe32d5412269f3041c58cbf0dde3306,"[Diana Ross__I'm Coming Out, Aretha Franklin__...","[Stevie Wonder__Signed, Sealed, Delivered (I'm..."
13615,ffec270eae226caa14ddaef291d73fff,[Walk the Moon__Shut Up and Dance],"[Calvin Harris__Under Control, Tiësto__Red Lig..."
13616,fff60baf392613ed33f745b89a9b38f7,"[Shaggy__Boombastic, Interpol__Everything Is W...","[Soft Cell__Tainted Love, Blondie__Call Me, Cu..."
13617,fff616055993498d6127f3f467cf9f2b,"[Matt and Kim__Let's Go, CAZZETTE__Sleepless -...","[Bon Iver__Holocene, Bon Iver__Skinny Love, Ba..."


In [42]:
# Mean precision_at_k of the 100-factor model across all test users.
# No k is passed, so the helper's own default applies — confirm in metrics.py.
result.apply(lambda row: precision_at_k(row['als_100'], row['actual']), axis=1).mean()

0.09997797195095087

In [51]:
# Persist the fitted 100-factor model to disk.
# NOTE: only unpickle this file from a trusted location — loading a pickle
# can execute arbitrary code.
with open('../models/als_100.pickle', 'wb') as f:
    pickle.dump(als_100, f)

In [53]:
# Save the CSR user-item matrix built from train.csv.
# NOTE(review): the row/column labels (userids, itemids) do not appear to be
# saved alongside it here — verify consumers can still map indices to labels.
save_npz('../data/train_user_item_matrix.npz', sparse_matrix)

In [43]:
# Same configuration as the 100-factor model, but with 256 latent factors,
# to compare the effect of the embedding size.
# NOTE(review): no random seed is set — confirm whether the installed
# implicit version accepts `random_state` if reproducibility matters.
als_256 = AlternatingLeastSquares(factors=256, 
                              regularization=0.001,
                              iterations=15, 
                              calculate_training_loss=True, 
                              num_threads=4)

In [46]:
# Fit on the transposed (songs x users) matrix, same as for the 100-factor model.
als_256.fit(sparse_matrix.T, show_progress=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [47]:
# Recommendations from the 256-factor model for every test user, using
# get_recommendations' default n.
result['als_256'] = result['user_id'].apply(lambda x: get_recommendations(als_256, x))

In [48]:
# Mean precision_at_k of the 256-factor model across all test users; the
# recorded value is lower than the 100-factor model's score.
result.apply(lambda row: precision_at_k(row['als_256'], row['actual']), axis=1).mean()

0.09225346941772525