Skip to content

Commit

Permalink
First working models.
Browse files Browse the repository at this point in the history
  • Loading branch information
maciejkula committed Jun 26, 2017
1 parent c0a6fc6 commit d92f3c5
Show file tree
Hide file tree
Showing 8 changed files with 635 additions and 9 deletions.
68 changes: 68 additions & 0 deletions spotlight/cross_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import numpy as np

from spotlight.interactions import Interactions


def _index_or_none(array, shuffle_index):

if array is None:
return None
else:
return array[shuffle_index]


def shuffle_interactions(interactions,
                         random_state=None):
    """
    Return a new Interactions object with its entries randomly permuted.

    Arguments
    ---------
    interactions: Interactions
        The interactions to shuffle. Its ``user_ids`` and ``item_ids``
        are indexed directly; ``ratings``, ``timestamps`` and ``weights``
        may each be None, in which case they stay None in the result.
    random_state: np.random.RandomState, optional
        Source of randomness. A fresh ``RandomState()`` is created
        when omitted.

    Returns
    -------
    Interactions
        A new object with the same permutation applied to every field;
        ``num_users`` and ``num_items`` are carried over unchanged.
    """

    rng = np.random.RandomState() if random_state is None else random_state

    # One permutation, applied to every field so entries stay aligned.
    order = np.arange(len(interactions.user_ids))
    rng.shuffle(order)

    users = interactions.user_ids[order]
    items = interactions.item_ids[order]
    ratings = _index_or_none(interactions.ratings, order)
    timestamps = _index_or_none(interactions.timestamps, order)
    weights = _index_or_none(interactions.weights, order)

    return Interactions(users,
                        items,
                        ratings=ratings,
                        timestamps=timestamps,
                        weights=weights,
                        num_users=interactions.num_users,
                        num_items=interactions.num_items)


def random_train_test_split(interactions,
                            test_percentage=0.2,
                            random_state=None):
    """
    Randomly split interactions into a train and a test set.

    The interactions are shuffled with ``shuffle_interactions`` and then
    cut at ``int((1.0 - test_percentage) * len(interactions))``: entries
    before the cutoff form the train set, the remainder the test set.

    Arguments
    ---------
    interactions: Interactions
        The interactions to split.
    test_percentage: float, optional
        Fraction of interactions placed in the test set.
    random_state: np.random.RandomState, optional
        Passed through to ``shuffle_interactions``.

    Returns
    -------
    (train, test): tuple of Interactions
        Both keep the ``num_users`` and ``num_items`` of the input.
    """

    shuffled = shuffle_interactions(interactions,
                                    random_state=random_state)

    cutoff = int((1.0 - test_percentage) * len(shuffled))

    def _subset(idx):
        # Apply the same slice to every field; optional fields stay None.
        return Interactions(shuffled.user_ids[idx],
                            shuffled.item_ids[idx],
                            ratings=_index_or_none(shuffled.ratings, idx),
                            timestamps=_index_or_none(shuffled.timestamps,
                                                      idx),
                            weights=_index_or_none(shuffled.weights, idx),
                            num_users=shuffled.num_users,
                            num_items=shuffled.num_items)

    train = _subset(slice(None, cutoff))
    test = _subset(slice(cutoff, None))

    return train, test
3 changes: 2 additions & 1 deletion spotlight/datasets/movielens.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import h5py

from spotlight.datasets import _transport
from spotlight.interactions import Interactions

VARIANTS = ('100K',
'1M',
Expand Down Expand Up @@ -41,4 +42,4 @@ def get_movielens_dataset(variant='100K'):

url = 'movielens_{}'.format(variant)

return _get_movielens(url)
return Interactions(*_get_movielens(url))
29 changes: 29 additions & 0 deletions spotlight/evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import numpy as np

import scipy.stats as st


def mrr_score(model, test, train=None):
    """
    Compute per-user mean reciprocal rank (MRR) scores.

    For every user (row of ``test``) with at least one test interaction,
    all scores returned by ``model.predict(user_id)`` are ranked so that
    the highest score gets rank 1, and the reciprocal ranks of that
    user's test items are averaged.

    Arguments
    ---------
    model: object
        Must expose ``predict(user_id)`` returning a numpy array with
        one score per item.
    test: object with a ``tocsr()`` method
        Test interactions; rows are users, columns are items.
    train: object with a ``tocsr()`` method, optional
        If given, items a user interacted with in ``train`` are pushed
        to the bottom of that user's ranking.

    Returns
    -------
    np.ndarray
        One MRR value per user that has test interactions, in row order.
        Users without test interactions are skipped.
    """

    test = test.tocsr()
    train = train.tocsr() if train is not None else None

    # Largest float32: assigned to (negated) scores of training items so
    # that they are ranked last.
    worst_score = np.finfo(np.float32).max

    per_user_mrr = []

    for user_id, row in enumerate(test):

        held_out = row.indices

        if len(held_out) == 0:
            continue

        # Negate so that rankdata's ascending ranks put the best score first.
        neg_scores = -model.predict(user_id)

        if train is not None:
            neg_scores[train[user_id].indices] = worst_score

        ranks = st.rankdata(neg_scores)
        per_user_mrr.append((1.0 / ranks[held_out]).mean())

    return np.array(per_user_mrr)
189 changes: 189 additions & 0 deletions spotlight/factorization/explicit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
import numpy as np

import torch

import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim

from torch.autograd import Variable


from spotlight.factorization.implicit import BilinearNet
from spotlight.losses import (bpr_loss,
hinge_loss,
pointwise_loss)
from spotlight.torch_utils import cpu, gpu, minibatch, shuffle


class TruncatedBilinearNet(nn.Module):
    """
    Network for the truncated regression model.

    Holds two BilinearNet instances built with identical arguments, one
    producing rating predictions and one producing the (pre-sigmoid)
    score for a rating being observed, plus a single learnable scalar
    exposed as ``stddev``.
    """

    def __init__(self, num_users, num_items, embedding_dim, sparse=False):
        super().__init__()

        self.embedding_dim = embedding_dim

        net_args = (num_users, num_items, embedding_dim)
        self.rating_net = BilinearNet(*net_args, sparse=sparse)
        self.observed_net = BilinearNet(*net_args, sparse=sparse)

        # A 1 x 1 embedding table, i.e. one learnable scalar.
        self.stddev = nn.Embedding(1, 1)

    def forward(self, user_ids, item_ids):
        """
        Return the tuple ``(observed, rating, stddev)`` for the given
        user-item pairs.

        ``observed`` is the sigmoid of the observed-net output, ``rating``
        is the raw rating-net output, and ``stddev`` is the single
        ``stddev`` parameter looked up once per entry of ``user_ids`` and
        viewed as a column.
        """

        observed_logits = self.observed_net(user_ids, item_ids)
        observed = F.sigmoid(observed_logits)

        rating = self.rating_net(user_ids, item_ids)

        # The comparison is False for every id >= -1, so this is an
        # all-zero index tensor shaped like user_ids; presumably ids are
        # never below -1, so every entry reads row 0 of the table.
        zero_index = (user_ids < -1).long()
        stddev = self.stddev(zero_index).view(-1, 1)

        return observed, rating, stddev


class ExplicitFactorizationModel(object):
    """
    An explicit feedback (rating prediction) factorization model,
    implemented in PyTorch.

    Available loss functions:
    - regression: minimizing the squared error between true and
      predicted ratings
    - truncated_regression: truncated regression model, that jointly models
      the likelihood of a rating being given and the value
      of the rating itself. The option is accepted and its network
      can be built, but no training loss is implemented for it yet,
      so `fit` raises NotImplementedError.

    Performance notes: neural network toolkits do not perform well on sparse tasks
    like recommendations. To achieve acceptable speed, either use the `sparse` option
    on a CPU or use CUDA with very big minibatches (1024+).

    Arguments
    ---------
    loss: string, optional
        One of 'regression' or 'truncated_regression'.
    embedding_dim: int, optional
        Passed to the underlying network as its embedding dimension.
    n_iter: int, optional
        Number of passes (epochs) over the data in `fit`.
    batch_size: int, optional
        Minibatch size used in `fit`.
    optimizer: callable, optional
        Factory called as ``optimizer(parameters)`` that returns the
        optimizer to train with, for example ``torch.optim.SGD`` wrapped
        in ``functools.partial``. It has to be a factory because the
        network, and therefore its parameters, only exist once `fit` is
        called. Defaults to ``torch.optim.Adam`` with default settings.
    use_cuda: bool, optional
        Forwarded to the `gpu` helper for the network and all tensors.
    sparse: bool, optional
        Forwarded to the network constructor.
    """

    _LOSSES = ('regression',
               'truncated_regression')

    def __init__(self,
                 loss='regression',
                 embedding_dim=64,
                 n_iter=3,
                 batch_size=64,
                 optimizer=None,
                 use_cuda=False,
                 sparse=False):

        # Membership in a real tuple. The previous check was missing the
        # comma, which concatenated the two names into one string and
        # turned `in` into a substring test (accepting e.g. '' or 'r').
        if loss not in self._LOSSES:
            raise ValueError(
                'loss must be one of {}, got {!r}'.format(self._LOSSES, loss)
            )

        self._loss = loss
        self._embedding_dim = embedding_dim
        self._n_iter = n_iter
        self._batch_size = batch_size
        self._use_cuda = use_cuda
        self._sparse = sparse
        self._optimizer_func = optimizer
        self._optimizer = None

        self._num_users = None
        self._num_items = None
        self._net = None

    def _build_net(self):
        """Instantiate the (not yet device-placed) network for the loss."""

        if self._loss == 'regression':
            net_class = BilinearNet
        else:
            net_class = TruncatedBilinearNet

        return net_class(self._num_users,
                         self._num_items,
                         self._embedding_dim,
                         sparse=self._sparse)

    def fit(self, interactions, verbose=False):
        """
        Fit the model.

        Every call builds a fresh network and a fresh optimizer, so
        repeated calls do not continue training a previous network.

        Arguments
        ---------
        interactions: np.float32 coo_matrix of shape [n_users, n_items]
             the matrix containing user-item interactions. The entries
             are the ratings to regress on.
        verbose: Bool, optional
             Whether to print epoch loss statistics.

        Raises
        ------
        NotImplementedError
            If the model was constructed with loss='truncated_regression'.
        """

        # Fail before touching any state: there is no training loss for
        # the truncated model.
        if self._loss != 'regression':
            raise NotImplementedError(
                'Fitting with loss {!r} is not implemented.'.format(self._loss)
            )

        self._num_users, self._num_items = interactions.shape

        self._net = gpu(self._build_net(), self._use_cuda)

        # The optimizer must reference the parameters of the network that
        # was just created, so it is rebuilt together with the network.
        if self._optimizer_func is None:
            self._optimizer = optim.Adam(self._net.parameters())
        else:
            self._optimizer = self._optimizer_func(self._net.parameters())

        for epoch_num in range(self._n_iter):

            users, items, ratings = shuffle(*(interactions.row,
                                              interactions.col,
                                              interactions.data))

            # Ids are cast to int64 exactly as in `predict`; ratings are
            # float32 to match the documented input dtype.
            user_ids_tensor = gpu(torch.from_numpy(users.astype(np.int64)),
                                  self._use_cuda)
            item_ids_tensor = gpu(torch.from_numpy(items.astype(np.int64)),
                                  self._use_cuda)
            ratings_tensor = gpu(torch.from_numpy(ratings.astype(np.float32)),
                                 self._use_cuda)

            epoch_loss = 0.0

            # NOTE(review): assumes `minibatch` accepts any number of
            # tensors and yields one aligned slice per tensor -- confirm
            # against spotlight.torch_utils.
            for (batch_user,
                 batch_item,
                 batch_ratings) in minibatch(user_ids_tensor,
                                             item_ids_tensor,
                                             ratings_tensor,
                                             batch_size=self._batch_size):

                user_var = Variable(batch_user)
                item_var = Variable(batch_item)
                ratings_var = Variable(batch_ratings)

                self._optimizer.zero_grad()

                # Flatten both sides so the difference is element-wise
                # whatever trailing dimension the network output carries.
                predictions = self._net(user_var, item_var)
                predictions = predictions.contiguous().view(-1)
                targets = ratings_var.contiguous().view(-1)

                # Mean squared error between true and predicted ratings.
                loss = ((targets - predictions) ** 2).mean()

                # NOTE(review): `loss.data[0]` is the pre-0.4 PyTorch way
                # of reading a scalar loss; newer releases need
                # `loss.item()` -- switch once the minimum supported
                # version is settled.
                epoch_loss += loss.data[0]

                loss.backward()
                self._optimizer.step()

            if verbose:
                print('Epoch {}: loss {}'.format(epoch_num, epoch_loss))

    def predict(self, user_ids, item_ids=None):
        """
        Compute the predicted rating for user-item pairs.

        Arguments
        ---------
        user_ids: integer or np.int32 array of shape [n_pairs,]
             single user id or an array containing the user ids for the user-item pairs for which
             a prediction is to be computed. A single id is paired with
             every entry of `item_ids`.
        item_ids: np.int32 array of shape [n_pairs,], optional
             an array containing the item ids for the user-item pairs for which
             a prediction is to be computed. If omitted, all items
             (0 .. num_items - 1) are scored.

        Returns
        -------
        np.ndarray
            Flat array with one prediction per user-item pair.

        Raises
        ------
        RuntimeError
            If the model has not been fit yet.
        """

        if self._net is None:
            raise RuntimeError('The model must be fit before calling '
                               'predict.')

        if item_ids is None:
            item_ids = np.arange(self._num_items, dtype=np.int64)

        item_ids = np.asarray(item_ids, dtype=np.int64).reshape(-1, 1)
        user_ids = np.asarray(user_ids, dtype=np.int64)

        if user_ids.ndim == 0:
            # Single user id: repeat it once per requested item.
            user_ids = np.full_like(item_ids, user_ids)
        else:
            user_ids = user_ids.reshape(-1, 1)

        user_var = Variable(gpu(torch.from_numpy(user_ids), self._use_cuda))
        item_var = Variable(gpu(torch.from_numpy(item_ids), self._use_cuda))

        out = self._net(user_var, item_var)

        if isinstance(out, tuple):
            # TruncatedBilinearNet returns (observed, rating, stddev);
            # the prediction is the rating component.
            _, out, _ = out

        return cpu(out.data).numpy().flatten()

0 comments on commit d92f3c5

Please sign in to comment.