In [16]:
import soydata
from soydata.data.external.movielens import load_rating

# Record the installed soydata version next to the results.
print(f'soydata=={soydata.__version__}\n')

# Load the MovieLens '20m' ratings; the second return value is discarded.
# presumably rows are users and columns are items (per the variable name) — TODO confirm
user_item, _ = load_rating(size='20m')
print(user_item.shape)

soydata==0.1.0

This function downloads MovieLens data from GroupLens
Please read first http://files.grouplens.org/datasets/movielens/ml-20m-README.html
All permissions are in GroupLens, and this function is an external utility to conventiently use MovieLens data.

(138494, 131263)


In [17]:
import numpy as np

# Row (user) and column (item) coordinates of every non-zero rating.
user_indices, item_indices = user_item.nonzero()

# Number of non-zero ratings per user and per item; `minlength` keeps
# trailing rows/columns that have no rating at all in the count vectors.
user_size, item_size = (
    np.bincount(indices, minlength=length)
    for indices, length in zip((user_indices, item_indices), user_item.shape)
)

# Users / items that never appear in a rating.
n_empty_user = int(np.count_nonzero(user_size == 0))
n_empty_item = int(np.count_nonzero(item_size == 0))

print(f'n empty user = {n_empty_user}')
print(f'n empty item = {n_empty_item}')

n empty user = 1
n empty item = 104519


In [18]:
from sklearn.metrics import pairwise_distances


class SimpleSimilarFinder:
    """Brute-force nearest-row search over a matrix.

    Every query computes the distance from the query vector(s) to all rows
    of ``matrix`` with ``sklearn.metrics.pairwise_distances`` and keeps the
    closest ones.

    Parameters
    ----------
    matrix : 2-D array-like or sparse matrix, shape (n_rows, n_features)
        Rows to search. Must expose ``.shape`` and row indexing.
    metric : str, default 'cosine'
        Metric name forwarded to ``pairwise_distances``.
    """

    def __init__(self, matrix, metric='cosine'):
        self.matrix = matrix
        self.metric = metric
        self.n_rows, self.n_features = matrix.shape

    def get_similars(self, idx, topk=10):
        """Return the rows closest to row ``idx`` of the indexed matrix.

        Parameters
        ----------
        idx : int
            Row index of the query; must satisfy ``0 <= idx < n_rows``.
        topk : int, default 10
            Maximum number of neighbours to return.

        Returns
        -------
        (similar_idx, similar_dist)
            See ``get_similars_from_vector``. The query row itself is part
            of the indexed matrix, so it is a candidate like any other row.

        Raises
        ------
        ValueError
            If ``idx`` is out of range.
        """
        if not (0 <= idx < self.n_rows):
            raise ValueError(f'Unknown query: {idx}')
        query_vec = self.matrix[idx, :]
        return self.get_similars_from_vector(query_vec, topk)

    def get_similars_from_vector(self, query, topk=10):
        """Return the rows closest to one or more query vectors.

        Parameters
        ----------
        query : array-like or sparse matrix
            Shape (n_features,) for a single query or (n_queries, n_features).
        topk : int, default 10
            Maximum number of neighbours per query. When it exceeds the
            number of indexed rows, all rows are returned.

        Returns
        -------
        similar_idx : ndarray, shape (n_queries, min(topk, n_rows))
            Row indices ordered by increasing distance.
        similar_dist : ndarray, same shape
            Distances matching ``similar_idx``.

        Raises
        ------
        ValueError
            If ``topk`` is negative.
        """
        if topk < 0:
            raise ValueError(f'topk must be non-negative: {topk}')
        if len(query.shape) == 1:
            query = query.reshape(1, -1)

        dist = pairwise_distances(query, self.matrix, metric=self.metric)
        similar_idx = dist.argsort(axis=1)[:, :topk]

        # Gather along the row axis with the index array itself; unlike a
        # hand-built (rows, cols) pair this stays consistent when the slice
        # above yields fewer than `topk` columns.
        similar_dist = np.take_along_axis(dist, similar_idx, axis=1)

        return similar_idx, similar_dist

In [19]:
# Brute-force index over the raw user-item matrix (default metric: cosine).
user_index = SimpleSimilarFinder(user_item)
# Nearest rows to row 1 with the default topk=10, closest first.
similar_users, similar_dist = user_index.get_similars(1)
# Turn cosine distance into cosine similarity.
similar_sim = 1 - similar_dist

print(similar_users.shape)
print(similar_dist.shape)
print(similar_sim.shape)

# Rating rows of the neighbours: one row per neighbour, one column per item.
candidates_history = user_item[similar_users.flatten()]
print(candidates_history.shape)

(1, 10)
(1, 10)
(1, 10)
(10, 131263)


In [20]:
from time import time


class CollaborativeFiltering:
    """User-based collaborative filtering on top of a similar-user index.

    Parameters
    ----------
    user_index : object
        Must provide ``get_similars(idx, topk)`` returning
        ``(indices, distances)``, each shaped (1, topk), like the
        similar-finder classes in this notebook. ``1 - distance`` is used
        as the similarity weight.
    user_history : scipy sparse matrix, shape (n_users, n_items)
        Ratings; a non-zero entry marks an item the user has already rated.
    """

    def __init__(self, user_index, user_history):
        self.user_index = user_index
        self.user_history = user_history
        self.n_users = user_history.shape[0]
        self.popular_by_scores, self.popular_by_view = prepare_most_popular(user_history)

    def recommender(self, user_idx, topk_users=10, topk_items=3):
        """Recommend unseen items for one user.

        Each item is scored by the similarity-weighted sum of the ratings
        given by the ``topk_users`` nearest users. Items the user has
        already rated and items whose score is not positive are dropped.

        Parameters
        ----------
        user_idx : int
            Row index of the target user.
        topk_users : int, default 10
            Number of neighbours requested from ``user_index``.
        topk_items : int, default 3
            Maximum number of items returned; a value <= 0 returns every
            positively scored item.

        Returns
        -------
        recommended_indices : ndarray
            Item (column) indices ordered by decreasing score.
        recommended_scores : ndarray
            Scores matching ``recommended_indices``.

        Prints one line with the number of returned items, the number of
        positively scored candidates and the elapsed time.
        """
        t = time()
        _, already_seen = self.user_history[user_idx, :].nonzero()

        similar_users, similar_dist = self.user_index.get_similars(user_idx, topk_users)
        similar_sim = 1 - similar_dist
        candidates_history = self.user_history[similar_users.flatten()]

        # Similarity-weighted rating sum: (1, k) . (k, n_items) -> (n_items,).
        # `.toarray()` yields a plain ndarray, so no numpy.matrix cast is needed.
        # NOTE: if the index returns the target user among its own neighbours,
        # that row only adds weight to items that are zeroed out just below.
        candidate_scores = np.dot(similar_sim, candidates_history.toarray()).reshape(-1)

        # remove already seen items
        candidate_scores[already_seen] = 0
        candidate_indices = candidate_scores.argsort()[::-1]

        # keep only positively scored items (they sort to the front)
        n_positive = int(np.count_nonzero(candidate_scores > 0))
        recommended_indices = candidate_indices[:n_positive]

        # re-ranking
        # TODO

        # select top items
        if topk_items > 0:
            recommended_indices = recommended_indices[:topk_items]

        # with score
        recommended_scores = candidate_scores[recommended_indices]

        # check processing time
        t = time() - t
        n_reco = recommended_scores.shape[0]
        print(f'recommend {n_reco} items from {n_positive} items with {t:.4} sec.')

        return recommended_indices, recommended_scores

    def most_popular(self, topk, by='score'):
        """Return the ``topk`` globally most popular item indices.

        Parameters
        ----------
        topk : int
            Number of items to return.
        by : str, default 'score'
            'score' ranks by the precomputed rating-sum ordering; any other
            value ranks by the number of ratings.
        """
        if by == 'score':
            # The original statement lacked `return`, so the by-view ranking
            # was returned regardless of `by`.
            return self.popular_by_scores[:topk]
        return self.popular_by_view[:topk]

def prepare_most_popular(user_history):
    """Rank items by total rating and by number of ratings.

    Parameters
    ----------
    user_history : scipy sparse matrix, shape (n_users, n_items)
        Ratings; non-zero entries count as one rating of the item.

    Returns
    -------
    popular_by_scores : ndarray, shape (n_items,)
        Item indices ordered by decreasing sum of ratings.
    popular_by_view : ndarray, shape (n_items,)
        Item indices ordered by decreasing number of non-zero ratings.
    """
    # A sparse-matrix column sum comes back as a 2-D numpy.matrix whose
    # reshape(-1) stays (1, n_items); convert to a real 1-D ndarray first,
    # otherwise the [::-1] below reverses the single row axis and leaves
    # the ranking in ascending order.
    score_sum = np.asarray(user_history.sum(axis=0)).reshape(-1)
    view_count = np.bincount(
        user_history.nonzero()[1],
        minlength=user_history.shape[1]
    )
    popular_by_scores = score_sum.argsort()[::-1]
    popular_by_view = view_count.argsort()[::-1]
    return popular_by_scores, popular_by_view

In [21]:
# Recommender backed by the brute-force cosine index on the raw ratings.
cf = CollaborativeFiltering(user_index, user_item)
# Up to 10 items for user 1, scored from its 5 nearest users.
items, scores = cf.recommender(1, topk_users=5, topk_items=10)

recommend 10 items from 494 items with 0.5786 sec.


In [22]:
%%time

# with SVD
# Reduce every user's rating row to 100 components so that neighbour
# search can run on a (n_users, 100) array instead of the full matrix.
# NOTE(review): no random_state is passed; if the solver is randomized the
# embedding (and the neighbours found below) may differ between runs —
# consider fixing a seed.
from sklearn.decomposition import TruncatedSVD

user_svd = TruncatedSVD(n_components=100).fit_transform(user_item)
user_svd.shape

CPU times: user 44.6 s, sys: 2.21 s, total: 46.8 s
Wall time: 20 s


In [23]:
# Same brute-force cosine search, but over the 100-dimensional SVD vectors.
user_index_svd = SimpleSimilarFinder(user_svd)
# Neighbours come from the SVD space; item scores still use the raw ratings.
cf_svd = CollaborativeFiltering(user_index_svd, user_item)
items, scores = cf_svd.recommender(1, topk_users=5, topk_items=10)

recommend 10 items from 559 items with 0.1007 sec.


In [24]:
from sklearn.preprocessing import normalize
from sklearn.neighbors import BallTree

# Scale each user vector with sklearn's `normalize`
# (presumably to unit L2 norm, its default — TODO confirm).
# For unit vectors ||a - b||^2 = 2 * (1 - cos(a, b)), so Euclidean nearest
# neighbours are also cosine nearest neighbours.
user_svd_norm = normalize(user_svd)
# Earlier attempt with the cosine metric, kept for reference:
# ball_tree = BallTree(user_svd, metric='cosine')
ball_tree = BallTree(user_svd_norm, metric='euclidean')

In [25]:
%%time
# 10 nearest neighbours of user 1 in the normalized SVD space; the query row
# is itself in the tree, so it is expected to come back at distance 0.
similar_euc, similar_idxs = ball_tree.query(user_svd_norm[1].reshape(1,-1), k=10, return_distance=True)

CPU times: user 56 ms, sys: 0 ns, total: 56 ms
Wall time: 56.2 ms


In [26]:
# Convert Euclidean distance between unit vectors to cosine distance:
# ||a - b||^2 = 2 - 2 * cos(a, b)  =>  1 - cos(a, b) = ||a - b||^2 / 2
similar_cos = (similar_euc ** 2) / 2

print(similar_idxs)
print(similar_euc)
print(similar_cos)

[[     1 110069  13605   5366   7412  62235  75328  47783  50679  13687]]
[[0.         0.65692462 0.6658841  0.67455978 0.69233518 0.69767421
  0.69845812 0.701774   0.70283821 0.70476037]]
[[0.         0.21577498 0.22170081 0.22751545 0.239664   0.24337465
  0.24392187 0.24624338 0.24699078 0.24834359]]


In [27]:
from sklearn.metrics import pairwise_distances

# Cross-check: brute-force cosine distance from user 1 to every user.
# The un-normalized `user_svd` is used on purpose; cosine distance does not
# depend on vector length, so the values should match `similar_cos`.
dist = pairwise_distances(user_svd_norm[1].reshape(1,-1), user_svd, metric='cosine').reshape(-1)
# Indices of the 10 smallest distances, closest first.
idxs = dist.argsort()[:10]
# NOTE: `dist` is rebound here from all distances to just the top-10 values.
dist = dist[idxs]

print(idxs)
print(dist)

[     1 110069  13605   5366   7412  62235  75328  47783  50679  13687]
[0.         0.21577498 0.22170081 0.22751545 0.239664   0.24337465
 0.24392187 0.24624338 0.24699078 0.24834359]


In [28]:
from sklearn.neighbors import BallTree


class BallTreeSimilarFinder:
    """Nearest-row search backed by a Euclidean ``BallTree``.

    Distances are reported as ``euclidean ** 2 / 2``. That value equals the
    cosine distance only when both the indexed rows and the query have unit
    L2 norm, so callers are expected to pass normalized vectors; nothing
    here normalizes or checks them.

    Parameters
    ----------
    matrix : array of shape (n_rows, n_features)
        Rows to index.
    leaf_size : int, default 40
        Forwarded to ``BallTree``.
    **kargs
        Extra keyword arguments forwarded to ``BallTree``.
    """

    def __init__(self, matrix, leaf_size=40, **kargs):
        self.matrix = matrix
        self.index = BallTree(matrix, leaf_size=leaf_size, metric='euclidean', **kargs)
        self.n_rows, self.n_features = matrix.shape

    def get_similars(self, idx, topk=10):
        """Look up the ``topk`` rows closest to row ``idx``.

        Raises ``ValueError`` when ``idx`` is outside ``[0, n_rows)``.
        Returns the same pair as ``get_similars_from_vector``.
        """
        in_range = 0 <= idx < self.n_rows
        if not in_range:
            raise ValueError(f'Unknown query: {idx}')
        return self.get_similars_from_vector(self.matrix[idx, :], topk)

    def get_similars_from_vector(self, query, topk=10):
        """Look up the ``topk`` rows closest to the given vector(s).

        A 1-D ``query`` is treated as a single row. Returns
        ``(similar_idx, similar_dist)`` as produced by the tree query, with
        the Euclidean distances mapped to ``euclidean ** 2 / 2``.
        """
        query_2d = query.reshape(1, -1) if len(query.shape) == 1 else query
        euclidean, neighbours = self.index.query(query_2d, topk, return_distance=True)
        return neighbours, (euclidean ** 2) / 2

In [29]:
# Index the normalized SVD vectors; the finder's distance conversion
# (euclidean ** 2 / 2) relies on the rows having unit norm.
user_index_bt = BallTreeSimilarFinder(user_svd_norm)
# Neighbours come from the BallTree; item scores still use the raw ratings.
cf_bt = CollaborativeFiltering(user_index_bt, user_item)
items, scores = cf_bt.recommender(1, topk_users=5, topk_items=10)

recommend 10 items from 559 items with 0.03913 sec.
