In [1]:
import random
import functools
import pandas as pd
import numpy as np
import scipy
from scipy.sparse import coo_matrix
from sklearn.preprocessing import label_binarize
import implicit
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline
np.random.seed(41)

data = pd.read_csv('rs.csv')
del data['Unnamed: 0']
data['datetime'] = pd.to_datetime(data['datetime'], format='%Y%m%d %H:%M:%S')
N, D = data.shape

data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242124 entries, 0 to 242123
Data columns (total 4 columns):
user        242124 non-null int64
item        242124 non-null int64
qty         242124 non-null int64
datetime    242124 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(3)
memory usage: 7.4 MB


Unnamed: 0,user,item,qty,datetime
0,32011003635952,16198,1,2014-01-01 00:39:00
1,32009100468450,18107,4,2014-01-01 01:07:00
2,32013007873699,13612,2,2014-01-01 07:00:00
3,32011004803503,42496,1,2014-01-01 07:06:00
4,32011004221857,8875,4,2014-01-01 07:09:00


## Without temporal effect, implement top-k recommendation

In [2]:
TEST_SIZE = 0.3
data_nt = data[['user', 'item', 'qty']] # _nt stands for no temporality

# map each artist and user to a unique numeric value
data_nt['user'] = data_nt['user'].astype("category")
data_nt['item'] = data_nt['item'].astype("category")
data_nt['qty'] = data['qty'].astype(float)

#train test set split
train_nt = data_nt.sample(frac=1-TEST_SIZE)
test_nt = data_nt.drop(train_nt.index)
print('train size = {}, test size = {}.'.format(len(train_nt), len(test_nt)))

# get mapping from category to code for user
#             from code to category for item
m_u = list(zip(data_nt['user'], data_nt['user'].cat.codes))
mapping_user = {}
for cat, code in m_u:
    mapping_user[cat] = code
m_i = list(zip(data_nt['item'], data_nt['item'].cat.codes))
mapping_item = {}
for cat, code in m_i:
    mapping_item[code] = cat

train size = 169487, test size = 72637.


In [3]:
#split train into K fold
FACTOR_NUMBER = 50
RECOMMENDATION_NUMBER = 10
KFOLD = 5
fold_interval = np.array(np.array_split(np.arange(len(train_nt)), KFOLD))
print('fold interval: ', fold_interval)
         
# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=FACTOR_NUMBER)

for i in range(KFOLD):
    train_idx = np.concatenate(fold_interval[np.arange(KFOLD)!=i])    
    train_val_nt = train_nt.iloc[train_idx, :]
    
        
    # create a sparse matrix of all the artist/user/play triples
    qtys = coo_matrix((train_val_nt['qty'], 
                   (train_val_nt['item'].cat.codes, 
                    train_val_nt['user'].cat.codes)))
    
    # train the model on a sparse matrix of item/user/confidence weights
    model.fit(qtys)
    user_items = qtys.T.tocsr()
    print('Shape of user-item matrix = {}.'.format(user_items.toarray().shape))
    
    # validation, recommend items for a user
    val_idx = np.concatenate(fold_interval[np.arange(KFOLD)==i])    
    val_nt = train_nt.iloc[val_idx, :]
    
    # transform validation userid to train_val userid
    correct_count = 0    
    for index in val_nt.index:
        userid = val_nt.loc[index, 'user']
        itemid = val_nt.loc[index, 'item']
        recommendations = model.recommend(mapping_user[userid], user_items, N=RECOMMENDATION_NUMBER)
        if itemid in [mapping_item[item] for item, score in recommendations]:
            correct_count += 1
    
    print('{}th accuracy: {}'.format(i, correct_count/len(val_nt)))

fold interval:  [array([    0,     1,     2, ..., 33895, 33896, 33897])
 array([33898, 33899, 33900, ..., 67793, 67794, 67795])
 array([ 67796,  67797,  67798, ..., 101690, 101691, 101692])
 array([101693, 101694, 101695, ..., 135587, 135588, 135589])
 array([135590, 135591, 135592, ..., 169484, 169485, 169486])]
Shape of user-item matrix = (2000, 1000).
0th accuracy: 0.020237182134639212
Shape of user-item matrix = (2000, 1000).
1th accuracy: 0.02159419434774913
Shape of user-item matrix = (2000, 1000).
2th accuracy: 0.021388323450452842
Shape of user-item matrix = (2000, 1000).
3th accuracy: 0.021358822314659114
Shape of user-item matrix = (2000, 1000).
4th accuracy: 0.02085730300616574


In [4]:
# Test
# create a sparse matrix of all the artist/user/play triples
qtys = coo_matrix((train_nt['qty'], 
               (train_nt['item'].cat.codes, 
                train_nt['user'].cat.codes)))

# train the model on a sparse matrix of item/user/confidence weights
model.fit(qtys)
user_items = qtys.T.tocsr()
print('Shape of user-item matrix = {}.'.format(user_items.toarray().shape))

# transform validation userid to train_val userid
correct_count = 0    
for index in test_nt.index:
    userid = test_nt.loc[index, 'user']
    itemid = test_nt.loc[index, 'item']
    recommendations = model.recommend(mapping_user[userid], user_items, N=RECOMMENDATION_NUMBER)
    if itemid in [mapping_item[item] for item, score in recommendations]:
        correct_count += 1

print('{}th accuracy: {}'.format(i, correct_count/len(test_nt)))

Shape of user-item matrix = (2000, 1000).
4th accuracy: 0.01786968074122004
