In [1]:
import pandas as pd
import numpy as np
from lightfm.datasets import fetch_movielens

data = fetch_movielens('movielens', indicator_features=False, genre_features=True)

print('original train')
print(np.unique(data['train'].data))
print(data['train'].__repr__())
print('original test')
print(np.unique(data['test'].data))
print(data['test'].__repr__())

# binarizing traing examples as in the original lightfm paper to use the logistic loss
data['train'].data = np.array([-1, 1])[1 * (data['train'].data >= 4)]
data['test'].data = np.array([-1, 1])[1 * (data['test'].data >= 4)]

# should keep only positive test interactions
data['test_positive_only'] = data['test'].copy()
data['test_positive_only'].data = 1 *(data['test_positive_only'].data>=1)
data['test_positive_only'].eliminate_zeros()

train = data['train']
test = data['test']
test_positives = data['test_positive_only']

print('train')
print(np.unique(data['train'].data))
print(data['train'].__repr__())
print('test')
print(np.unique(data['test'].data))
print(data['test'].__repr__())
print('test_positive_only')
print(np.unique(data['test_positive_only'].data))
print(data['test_positive_only'].__repr__())

item_features = data['item_features']
tag_labels = data['item_feature_labels']
print('There are %s distinct item features, with values like %s.' % (item_features.shape[1], tag_labels[:3].tolist()))



original train
[1 2 3 4 5]
<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 90570 stored elements in COOrdinate format>
original test
[1 2 3 4 5]
<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 9430 stored elements in COOrdinate format>
train
[-1  1]
<943x1682 sparse matrix of type '<class 'numpy.int64'>'
	with 90570 stored elements in COOrdinate format>
test
[-1  1]
<943x1682 sparse matrix of type '<class 'numpy.int64'>'
	with 9430 stored elements in COOrdinate format>
test_positive_only
[1]
<943x1682 sparse matrix of type '<class 'numpy.int64'>'
	with 5469 stored elements in COOrdinate format>
There are 19 distinct item features, with values like ['genre:unknown', 'genre:Action', 'genre:Adventure'].


In [2]:
train_df = pd.DataFrame.from_dict({
        'user': train.row,
        'item': train.col,
        'rating': train.data,
    })

test_df = pd.DataFrame.from_dict({
        'user': test.row,
        'item': test.col,
        'rating': test.data,
    })

print(train_df.shape)
train_df.head()

test_user_ids = test_df.user.unique()
all_user_ids = train_df.user.unique()
all_item_ids = np.unique(data['item_features'].tocoo().row)

def to_all_user_items(user_ids, item_ids):
    return pd.DataFrame.from_dict(
        {'user': np.repeat(user_ids, len(item_ids)),
         'item': np.tile(item_ids, len(user_ids))})

all_user_items = to_all_user_items(all_user_ids, all_item_ids)
print(all_user_items.shape)
print(test_user_ids.shape)
all_user_items.head()

(90570, 3)
(1586126, 2)
(943,)


Unnamed: 0,item,user
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [3]:
from lightfm import LightFM
from lightfm.evaluation import reciprocal_rank, auc_score, recall_at_k, precision_at_k

cf_model = LightFM(loss='logistic', item_alpha=0.0001, no_components=10, learning_rate=0.001)
cf_model.fit(train, epochs=20)

print('Collaborative filtering train/test MRR: %.3f / %.3f'
      % (reciprocal_rank(cf_model, data['train']).mean(),
         reciprocal_rank(cf_model, data['test']).mean()))

print('Collaborative filtering train/test AUC: %.3f / %.3f'
      % (auc_score(cf_model, data['train']).mean(),
         auc_score(cf_model, data['test'], train_interactions=None).mean()))

Collaborative filtering train/test MRR: 0.609 / 0.179
Collaborative filtering train/test AUC: 0.658 / 0.647


In [4]:
from lightfm import LightFM
from lightfm.evaluation import reciprocal_rank, auc_score, recall_at_k

cf_model = LightFM(loss='bpr', item_alpha=0.0001, no_components=20)
cf_model.fit(train, epochs=10)

print('Collaborative filtering train/test MRR: %.3f / %.3f'
      % (reciprocal_rank(cf_model, data['train']).mean(),
         reciprocal_rank(cf_model, data['test']).mean()))

print('Collaborative filtering train/test AUC: %.3f / %.3f'
      % (auc_score(cf_model, data['train']).mean(),
         auc_score(cf_model, data['test'], train_interactions=None).mean()))

Collaborative filtering train/test MRR: 0.804 / 0.255
Collaborative filtering train/test AUC: 0.854 / 0.827


In [5]:
from lightfm import LightFM
from lightfm.evaluation import reciprocal_rank, auc_score

cf_model = LightFM(loss='warp', item_alpha=0.0001, no_components=20)
cf_model.fit(train, epochs=10)

print('Collaborative filtering train/test MRR: %.3f / %.3f'
      % (reciprocal_rank(cf_model, data['train']).mean(),
         reciprocal_rank(cf_model, data['test']).mean()))

print('Collaborative filtering train/test AUC: %.3f / %.3f'
      % (auc_score(cf_model, data['train']).mean(),
         auc_score(cf_model, data['test'], train_interactions=None).mean()))

Collaborative filtering train/test MRR: 0.813 / 0.262
Collaborative filtering train/test AUC: 0.920 / 0.887
