First working models.

maciejkula · Jun 26, 2017 · d92f3c5 · d92f3c5
1 parent c0a6fc6
commit d92f3c5
Show file tree

Hide file tree

Showing 8 changed files with 635 additions and 9 deletions.
diff --git a/spotlight/cross_validation.py b/spotlight/cross_validation.py
@@ -0,0 +1,68 @@
+import numpy as np
+
+from spotlight.interactions import Interactions
+
+
+def _index_or_none(array, shuffle_index):
+    """
+    Index ``array`` with ``shuffle_index``, passing ``None`` through.
+
+    Optional interaction fields (ratings, timestamps, weights) may be
+    absent; this lets callers index them without checking for ``None``.
+    """
+
+    return None if array is None else array[shuffle_index]
+
+
+def shuffle_interactions(interactions,
+                         random_state=None):
+    """
+    Return a new Interactions object with its entries randomly permuted.
+
+    Arguments
+    ---------
+
+    interactions: Interactions
+         the interactions to shuffle.
+    random_state: np.random.RandomState, optional
+         source of randomness; a fresh RandomState is created when None.
+    """
+
+    rng = np.random.RandomState() if random_state is None else random_state
+
+    # A single permutation is applied to every per-interaction array so
+    # that users, items and the optional fields stay aligned.
+    permutation = np.arange(len(interactions.user_ids))
+    rng.shuffle(permutation)
+
+    optional_fields = {
+        name: _index_or_none(getattr(interactions, name), permutation)
+        for name in ('ratings', 'timestamps', 'weights')
+    }
+
+    return Interactions(interactions.user_ids[permutation],
+                        interactions.item_ids[permutation],
+                        num_users=interactions.num_users,
+                        num_items=interactions.num_items,
+                        **optional_fields)
+
+
+def random_train_test_split(interactions,
+                            test_percentage=0.2,
+                            random_state=None):
+    """
+    Randomly split interactions into a train and a test set.
+
+    The interactions are shuffled first; the leading
+    ``1 - test_percentage`` fraction becomes the train set and the
+    remainder the test set.
+
+    Arguments
+    ---------
+
+    interactions: Interactions
+         the interactions to split.
+    test_percentage: float, optional
+         fraction of interactions to place in the test set.
+    random_state: np.random.RandomState, optional
+         source of randomness passed on to shuffle_interactions.
+
+    Returns
+    -------
+
+    (train, test): tuple of Interactions
+    """
+
+    shuffled = shuffle_interactions(interactions,
+                                    random_state=random_state)
+
+    cutoff = int((1.0 - test_percentage) * len(shuffled))
+
+    def _subset(idx):
+        # Both halves keep the full num_users / num_items dimensions.
+        return Interactions(shuffled.user_ids[idx],
+                            shuffled.item_ids[idx],
+                            ratings=_index_or_none(shuffled.ratings, idx),
+                            timestamps=_index_or_none(shuffled.timestamps,
+                                                      idx),
+                            weights=_index_or_none(shuffled.weights, idx),
+                            num_users=shuffled.num_users,
+                            num_items=shuffled.num_items)
+
+    train = _subset(slice(None, cutoff))
+    test = _subset(slice(cutoff, None))
+
+    return train, test
diff --git a/spotlight/datasets/movielens.py b/spotlight/datasets/movielens.py
@@ -3,6 +3,7 @@
 import h5py
 
 from spotlight.datasets import _transport
+from spotlight.interactions import Interactions
 
 VARIANTS = ('100K',
             '1M',
@@ -41,4 +42,4 @@ def get_movielens_dataset(variant='100K'):
 
     url = 'movielens_{}'.format(variant)
 
-    return _get_movielens(url)
+    return Interactions(*_get_movielens(url))
diff --git a/spotlight/evaluation.py b/spotlight/evaluation.py
@@ -0,0 +1,29 @@
+import numpy as np
+
+import scipy.stats as st
+
+
+def mrr_score(model, test, train=None):
+    """
+    Compute the mean reciprocal rank of test items for each user.
+
+    Users with no test interactions are skipped.
+
+    Arguments
+    ---------
+
+    model: object with a ``predict(user_id)`` method
+         ``predict`` must return an array of scores that can be indexed
+         by item id.
+    test: object with a ``tocsr()`` method
+         held-out interactions, one row per user.
+    train: object with a ``tocsr()`` method, optional
+         if given, items the user interacted with in training are pushed
+         to the bottom of the ranking.
+
+    Returns
+    -------
+
+    np.array with one MRR value per user that has test interactions.
+    """
+
+    test_csr = test.tocsr()
+    train_csr = train.tocsr() if train is not None else None
+
+    # Scores are negated below, so the largest float32 ranks last.
+    masked_score = np.finfo(np.float32).max
+
+    per_user_mrr = []
+
+    for user_id, row in enumerate(test_csr):
+
+        held_out_items = row.indices
+
+        if len(held_out_items) == 0:
+            continue
+
+        # Negate so that the highest score receives rank 1.
+        negated_scores = -model.predict(user_id)
+
+        if train_csr is not None:
+            negated_scores[train_csr[user_id].indices] = masked_score
+
+        ranks = st.rankdata(negated_scores)
+        per_user_mrr.append((1.0 / ranks[held_out_items]).mean())
+
+    return np.array(per_user_mrr)
diff --git a/spotlight/factorization/explicit.py b/spotlight/factorization/explicit.py
@@ -0,0 +1,189 @@
+import numpy as np
+
+import torch
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+import torch.optim as optim
+
+from torch.autograd import Variable
+
+
+from spotlight.factorization.implicit import BilinearNet
+from spotlight.losses import (bpr_loss,
+                              hinge_loss,
+                              pointwise_loss)
+from spotlight.torch_utils import cpu, gpu, minibatch, shuffle
+
+
+class TruncatedBilinearNet(nn.Module):
+    """
+    Pair of bilinear networks plus a single learned standard deviation.
+
+    One BilinearNet predicts the rating value, the other (passed through
+    a sigmoid) predicts whether the user-item pair is observed.
+    """
+
+    def __init__(self, num_users, num_items, embedding_dim, sparse=False):
+        super().__init__()
+
+        self.embedding_dim = embedding_dim
+
+        net_dims = (num_users, num_items, embedding_dim)
+
+        self.rating_net = BilinearNet(*net_dims, sparse=sparse)
+        self.observed_net = BilinearNet(*net_dims, sparse=sparse)
+
+        # One-row embedding used as a single scalar parameter.
+        self.stddev = nn.Embedding(1, 1)
+
+    def forward(self, user_ids, item_ids):
+        """
+        Return (observed probability, predicted rating, stddev) for the
+        given user-item pairs.
+        """
+
+        observed_logit = self.observed_net(user_ids, item_ids)
+        observed = F.sigmoid(observed_logit)
+
+        rating = self.rating_net(user_ids, item_ids)
+
+        # Comparing ids against -1 gives an index tensor shaped like
+        # user_ids; it is all zeros as long as ids are non-negative
+        # (assumed -- TODO confirm), selecting the only stddev row.
+        stddev_index = (user_ids < -1).long()
+        stddev = self.stddev(stddev_index).view(-1, 1)
+
+        return observed, rating, stddev
+
+
+class ExplicitFactorizationModel(object):
+    """
+    An explicit feedback (rating prediction) factorization model,
+    implemented in PyTorch.
+
+    Available loss functions:
+    - regression: minimizing the squared error between true and predicted
+                  ratings
+    - truncated_regression: truncated regression model, that jointly models
+                            the likelihood of a rating being given and the value
+                            of the rating itself.
+
+    Performance notes: neural network toolkits do not perform well on sparse tasks
+    like recommendations. To achieve acceptable speed, either use the `sparse` option
+    on a CPU or use CUDA with very big minibatches (1024+).
+    """
+
+    # Loss names accepted by the constructor.
+    _LOSSES = ('regression',
+               'truncated_regression')
+
+    # Keeps logarithms and divisions in the truncated loss finite.
+    _EPS = 1e-8
+
+    def __init__(self,
+                 loss='regression',
+                 embedding_dim=64,
+                 n_iter=3,
+                 batch_size=64,
+                 optimizer=None,
+                 use_cuda=False,
+                 sparse=False):
+        """
+        Arguments
+        ---------
+
+        loss: string, optional
+             one of 'regression' or 'truncated_regression'.
+        embedding_dim: int, optional
+             dimensionality of the user and item embeddings.
+        n_iter: int, optional
+             number of training epochs.
+        batch_size: int, optional
+             minibatch size.
+        optimizer: optional
+             None (Adam with default settings is created in fit), a
+             callable taking the network parameters and returning an
+             optimizer (for example an optimizer class), or an already
+             constructed optimizer, which is used as given.
+        use_cuda: bool, optional
+             whether to run on the GPU.
+        sparse: bool, optional
+             passed on to the underlying networks.
+
+        Raises
+        ------
+
+        ValueError: if loss is not a supported loss name.
+        """
+
+        # Membership in a tuple: a bare string here would turn this into
+        # a substring test and accept values such as 'reg'.
+        if loss not in self._LOSSES:
+            raise ValueError(
+                'Unknown loss {!r}; expected one of {}'.format(loss,
+                                                               self._LOSSES)
+            )
+
+        self._loss = loss
+        self._embedding_dim = embedding_dim
+        self._n_iter = n_iter
+        self._batch_size = batch_size
+        self._use_cuda = use_cuda
+        self._sparse = sparse
+
+        # What the caller asked for; the optimizer actually used is built
+        # in fit, once the network (and its parameters) exist.
+        self._optimizer_arg = optimizer
+        self._optimizer = None
+
+        self._num_users = None
+        self._num_items = None
+        self._net = None
+
+    def _make_optimizer(self):
+        # Build the optimizer for the current network.
+
+        if self._optimizer_arg is None:
+            return optim.Adam(self._net.parameters())
+
+        if callable(self._optimizer_arg):
+            return self._optimizer_arg(self._net.parameters())
+
+        # NOTE(review): a pre-built optimizer cannot reference parameters
+        # of a network that is only created in fit; prefer a callable.
+        return self._optimizer_arg
+
+    def _regression_loss(self, user_var, item_var, ratings_var):
+        # Mean squared error between predicted and true ratings.
+
+        predicted = self._net(user_var, item_var).view(-1)
+
+        return ((ratings_var.view(-1) - predicted) ** 2).mean()
+
+    def _truncated_regression_loss(self, user_var, item_var, ratings_var):
+        # Negative log-likelihood of the truncated regression model.
+        #
+        # NOTE(review): the intended formulation is not part of this
+        # file. This version scores observed pairs with
+        # log p(observed) + log N(rating | prediction, stddev) and
+        # contrasts them with uniformly sampled items treated as
+        # unobserved -- confirm against the intended model.
+
+        observed, predicted, stddev = self._net(user_var, item_var)
+
+        observed = observed.view(-1)
+        predicted = predicted.view(-1)
+        variance = stddev.view(-1) ** 2 + self._EPS
+
+        positive_nll = -(torch.log(observed + self._EPS) -
+                         0.5 * torch.log(variance) -
+                         0.5 * (ratings_var.view(-1) - predicted) ** 2 /
+                         variance)
+
+        negative_items = np.random.randint(0,
+                                           self._num_items,
+                                           size=user_var.size(0))
+        negative_var = Variable(
+            gpu(torch.from_numpy(negative_items.astype(np.int64)),
+                self._use_cuda)
+        )
+
+        negative_observed, _, _ = self._net(user_var, negative_var)
+        negative_nll = -torch.log(1.0 - negative_observed.view(-1) +
+                                  self._EPS)
+
+        return torch.cat([positive_nll, negative_nll]).mean()
+
+    def fit(self, interactions, verbose=False):
+        """
+        Fit the model.
+
+        A new network (and, unless a pre-built optimizer was supplied, a
+        new optimizer) is created on every call.
+
+        Arguments
+        ---------
+
+        interactions: np.float32 coo_matrix of shape [n_users, n_items]
+             the matrix containing user-item interactions. The entries
+             are the ratings.
+        verbose: Bool, optional
+             Whether to print epoch loss statistics.
+        """
+
+        self._num_users, self._num_items = interactions.shape
+
+        if self._loss == 'regression':
+            net = BilinearNet(self._num_users,
+                              self._num_items,
+                              self._embedding_dim,
+                              sparse=self._sparse)
+            loss_fnc = self._regression_loss
+        else:
+            net = TruncatedBilinearNet(self._num_users,
+                                       self._num_items,
+                                       self._embedding_dim,
+                                       sparse=self._sparse)
+            loss_fnc = self._truncated_regression_loss
+
+        self._net = gpu(net, self._use_cuda)
+
+        # Rebuilt per fit so the optimizer always tracks the parameters
+        # of the network created above.
+        self._optimizer = self._make_optimizer()
+
+        for epoch_num in range(self._n_iter):
+
+            users, items, ratings = shuffle(interactions.row,
+                                            interactions.col,
+                                            interactions.data)
+
+            # Ids are cast to int64 as in predict; ratings to float32 as
+            # documented for the input matrix.
+            user_ids_tensor = gpu(
+                torch.from_numpy(users.astype(np.int64)),
+                self._use_cuda
+            )
+            item_ids_tensor = gpu(
+                torch.from_numpy(items.astype(np.int64)),
+                self._use_cuda
+            )
+            ratings_tensor = gpu(
+                torch.from_numpy(ratings.astype(np.float32)),
+                self._use_cuda
+            )
+
+            epoch_loss = 0.0
+
+            for (batch_user,
+                 batch_item,
+                 batch_ratings) in minibatch(user_ids_tensor,
+                                             item_ids_tensor,
+                                             ratings_tensor,
+                                             batch_size=self._batch_size):
+
+                user_var = Variable(batch_user)
+                item_var = Variable(batch_item)
+                ratings_var = Variable(batch_ratings)
+
+                self._optimizer.zero_grad()
+
+                loss = loss_fnc(user_var, item_var, ratings_var)
+                epoch_loss += loss.data[0]
+
+                loss.backward()
+                self._optimizer.step()
+
+            if verbose:
+                print('Epoch {}: loss {}'.format(epoch_num, epoch_loss))
+
+    def predict(self, user_ids, item_ids, ratings=True):
+        """
+        Compute the recommendation score for user-item pairs.
+
+        Arguments
+        ---------
+
+        user_ids: integer or np.int32 array of shape [n_pairs,]
+             single user id or an array containing the user ids for the user-item pairs for which
+             a prediction is to be computed
+        item_ids: np.int32 array of shape [n_pairs,]
+             an array containing the item ids for the user-item pairs for which
+             a prediction is to be computed.
+        ratings: bool, optional
+             Return predictions on ratings (rather than likelihood of rating).
+             Only has an effect for the truncated_regression loss.
+
+        Returns
+        -------
+
+        np.array of shape [n_pairs,] with the predicted scores.
+
+        Raises
+        ------
+
+        RuntimeError: if the model has not been fitted.
+        """
+
+        if self._net is None:
+            raise RuntimeError('Model must be fitted before calling '
+                               'predict.')
+
+        item_ids = np.asarray(item_ids).reshape(-1, 1).astype(np.int64)
+        user_ids = np.asarray(user_ids).reshape(-1, 1).astype(np.int64)
+
+        # A single user id is paired with every requested item.
+        if len(user_ids) == 1 and len(item_ids) > 1:
+            user_ids = np.repeat(user_ids, len(item_ids), axis=0)
+
+        user_var = Variable(gpu(torch.from_numpy(user_ids),
+                                self._use_cuda))
+        item_var = Variable(gpu(torch.from_numpy(item_ids),
+                                self._use_cuda))
+
+        out = self._net(user_var, item_var)
+
+        if self._loss == 'truncated_regression':
+            # The truncated network returns (observed, rating, stddev).
+            observed, predicted_ratings, _ = out
+            out = predicted_ratings if ratings else observed
+
+        return cpu(out.data).numpy().flatten()