# Item-Based Collaborative Filtering

In [1]:
# Data manipulation
import pandas as pd
import numpy as np
import numba
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, Predictor, als, basic, user_knn
from lenskit import topn
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from lenskit.data import sparse_ratings

# Dataset
from lenskit.datasets import ML100K
movielens = ML100K('../ml-100k')

# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

# Visualizations and debugging
import plotly.graph_objs as go
#from pprintpp import pprint as pp
import logging

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

In [2]:
from lenskit.datasets import ML100K
movielens = ML100K('ml-100k')

In [3]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [4]:
# Data import
ratings = movielens.ratings
uir, users, items = sparse_ratings(ratings, scipy=True)
M = uir
lambda1,lambda2,tol = 1.0,10.0,1e-3
X_o=csr_matrix(M.shape)
Z=csr_matrix(M.shape)
Gamma=csr_matrix(M.shape)
M_s = (uir!=0)

In [5]:
class ItemBasedCosinSimilarity(Recommender, Predictor):
    """
    Recommend new items by finding items that are the most similar to already rated items by users using the cosin distance formula

    """
    def __init__(self, n_neighbors = 11, min_neighbors=1, min_sim=0, alpha=0.5, explore_percent=0.3, selector = None):
        
        # Set selector
        if selector is None:
            self.selector = basic.UnratedItemCandidateSelector()
        else:
            self.selector = selector
            
        # Set parameters
        self.min_neighbors = min_neighbors
        self.min_sim = min_sim
        self.n_neighbors = n_neighbors
        
        # Determines the weight given to normalized popularity
        self.alpha = alpha

        self.explore_percent = explore_percent
        
        # Enable logging 
        _logger = logging.getLogger(__name__)
        
    def __str__(self):
        return 'ItemBasedCosinSimilarity'
            
    # Store the ratings matrix in sparse format and generate similarity matrix
    def fit(self, ratings, **kwargs):
        
        # Get sparse representation in CSR format
        uir, users, items = sparse_ratings(ratings, scipy=True)
        
        # Store ratings
        self.rating_matrix_ = uir
        self.user_index_ = users
        self.item_index_ = items
        self.rating_matrix_iur = uir.T
        
        knn = NearestNeighbors(metric='cosine', algorithm='brute')
        knn.fit(self.rating_matrix_iur)
        distances, indices = knn.kneighbors(self.rating_matrix_iur, n_neighbors=self.n_neighbors)
        
        self.distances = distances
        self.indices = indices
        
        # Reduce candidate space to unseen items
        self.selector.fit(ratings)
    
    # Add a user to the ratings matrix
    def add_user(self, user_id):
        
        # Check if user_id to be added already exists
        try:
            assert (user_id in self.user_index_) == False, "User ID already exists! Not adding anything..."
        
        except AssertionError as e:
            print(e)
            exit(1)

        # Build a sparse matrix of length of number of items
        tmp_sparse_row = sparse.csr_matrix(np.zeros((1,len(self.item_index_))))

        # Vertically stack temporary matrix to original matrix
        self.rating_matrix_ = sparse.vstack([self.rating_matrix_, tmp_sparse_row])
        
        # Update user index
        self.user_index_ = self.user_index_.append(pd.Index([user_id]))
    
        
    # Add a item to the ratings matrix
    def add_item(self, item_id):
        
        # Check if item_id to be added already exists
        try:
            assert (item_id in self.item_index_) == False, "Item ID already exists!"
        
        except AssertionError as e:
            print(e)
            exit(1)
        
        # Build a sparse matrix of length of number of users
        tmp_sparse_col = sparse.csr_matrix(np.zeros((len(self.user_index_),1)))
        
        # Horizotnally stack temporary matrix to original matrix
        self.rating_matrix_ = sparse.hstack([self.rating_matrix_, tmp_sparse_col])
        
        # Update item index
        self.item_index_ = self.item_index_.append(pd.Index([item_id]))
        
        
    # Add a user-item interaction for existing users and items
    def add_interactions(self, user_id, item_id, rating):
    
        # Check if inputs are lists and all input list lengths are equal
        assert type(user_id) == list, "Input user_id is not a list"
        assert type(item_id) == list , "Input item_id is not a list"
        assert type(rating) == list, "Input rating is not a list"
        assert len(user_id) == len(item_id) == len(rating), "Input lists are not of the same length"
        
        # Build a temporary sparse LIL matrix
        
        tmp_ratings = sparse.lil_matrix(self.rating_matrix_.shape)
        
        for i in range(len(user_id)):
            
            # Obtain locations from ID
            user_pos, = np.where(self.user_index_ == user_id[i])[0]
            item_pos, = np.where(self.item_index_ == item_id[i])[0]
            
            # Fill into temporary sparse matrix
            tmp_ratings[user_pos, item_pos] = rating[i]
                    
        # Convert temporary LIL to CSR
        tmp_ratings = tmp_ratings.tocsr()
        
        # Add temporary CSR to main ratings matrix
        self.rating_matrix_ += tmp_ratings

    # Provide a recommendation of top "n" movies given "user"
    # The recommender uses the UnratedItemCandidateSelector by default and uses the ratings matrix 
    # it was originally fit on
    def recommend(self, user_id, candidates=None, ratings=None):
        
        # Reduce candidate space and store candidates with item ID
        if candidates is None:
            candidates = self.selector.candidates(user_id, ratings)
        
        # Grab user index for given user_id
        user_index, = np.where(self.user_index_ == user_id)[0]
        
        # Predict ratings and scores for all unseen items
        prediction_score_df = self.predict_for_user(user_index, candidates)
        
        return(prediction_score_df) 
    
    # Modified from https://towardsdatascience.com/item-based-collaborative-filtering-in-python-91f747200fab
    def predict_for_user(self, user, items):
        
        # Instantiate ratings and item_popularity vectors
        predicted_ratings = np.zeros(len(items), dtype=float)
        item_popularity = np.zeros(len(items), dtype=float)
        
        coo_ratings = self.rating_matrix_.tocoo()
        rating_matrix_users = coo_ratings.row
        rating_matrix_items = coo_ratings.col
        rating_matrix_data = coo_ratings.data
        
        iur = self.rating_matrix_iur
        iur_copy = iur.copy()
        for i in range(len(items)):

            m = self.item_index_.get_loc(items[i])
            sim_movies = self.indices[m].tolist()
            movie_distances = self.distances[m].tolist()

            # Generally, this is the case: indices[3] = [3 6 7]. The movie itself is in the first place.
            # In this case, we take off 3 from the list. Then, indices[3] == [6 7] to have the nearest NEIGHBORS in the list. 
            if m in sim_movies:
              id_movie = sim_movies.index(m)
              sim_movies.remove(m)
              movie_distances.pop(id_movie) 

            # However, if the percentage of ratings in the dataset is very low, there are too many 0s in the dataset. 
            # Some movies have all 0 ratings and the movies with all 0s are considered the same movies by NearestNeighbors(). 
            # Then,even the movie itself cannot be included in the indices. 
            # For example, indices[3] = [2 4 7] is possible if movie_2, movie_3, movie_4, and movie_7 have all 0s for their ratings.
            # In that case, we take off the farthest movie in the list. Therefore, 7 is taken off from the list, then indices[3] == [2 4].
            else:
              sim_movies = sim_movies[:self.n_neighbors-1]
              movie_distances = movie_distances[:self.n_neighbors-1]

            # movie_similarty = 1 - movie_distance    
            movie_similarity = [1-x for x in movie_distances]
            movie_similarity_copy = movie_similarity.copy()
            nominator = 0

            # for each similar movie
            for s in range(0, len(movie_similarity)):

              # check if the rating of a similar movie is zero
              if iur[sim_movies[s], user] == 0:

                # if the rating is zero, ignore the rating and the similarity in calculating the predicted rating
                if len(movie_similarity_copy) == (self.n_neighbors - 1):
                  movie_similarity_copy.pop(s)

                else:
                  movie_similarity_copy.pop(s-(len(movie_similarity)-len(movie_similarity_copy)))

              # if the rating is not zero, use the rating and similarity in the calculation
              else:
                nominator = nominator + movie_similarity[s]*iur[sim_movies[s],user]

            # check if the number of the ratings with non-zero is positive
            if len(movie_similarity_copy) > 0:

              # check if the sum of the ratings of the similar movies is positive.
              if sum(movie_similarity_copy) > 0:
                predicted_r = nominator/sum(movie_similarity_copy)

              # Even if there are some movies for which the ratings are positive, some movies have zero similarity even though they are selected as similar movies.
              # in this case, the predicted rating becomes zero as well  
              else:
                predicted_r = 0

            # if all the ratings of the similar movies are zero, then predicted rating should be zero
            else:
              predicted_r = 0

          # place the predicted rating into the copy of the original dataset
#            iur_copy[m,user_index] = predicted_r
            predicted_ratings[i] = predicted_r
            
            # Item position given item i ID
            item_pos = self.item_index_.get_loc(items[i])
            
            # Locations of ratings for item_pos
            rating_locations, = np.where(rating_matrix_items == item_pos)
            
            # Store popularity of item based on number of total ratings 
            item_popularity[i] = len(rating_locations)
        
        # minmax scale the popularity of each item
        normalized_popularity = np.interp(item_popularity, (item_popularity.min(), item_popularity.max()), (0, +1))
        score = np.multiply(normalized_popularity, predicted_ratings)
        
        results = {'predicted_ratings':predicted_ratings, 'normalized_popularity':normalized_popularity}
        return pd.DataFrame(results, index=items)
            


In [6]:
%%time

# Instantiate object
algo_mf = ItemBasedCosinSimilarity()

# Reduce the candidates space + build user-user cosin similarity matrix 
algo_mf.fit(ratings)

CPU times: user 478 ms, sys: 31.5 ms, total: 510 ms
Wall time: 510 ms


In [7]:
%%time

# Ask for rating predictions for u users
for u in range (1,10):
    recs = algo_mf.recommend(u)

CPU times: user 3.85 s, sys: 22.3 ms, total: 3.87 s
Wall time: 3.91 s


In [8]:
# View the last set of recommendations
recs.sort_values(
    by=["predicted_ratings", "normalized_popularity"],
    ascending=False
)[["predicted_ratings", "normalized_popularity"]].head(20)

Unnamed: 0,predicted_ratings,normalized_popularity
515,5.0,0.393701
255,5.0,0.336614
249,5.0,0.253937
510,5.0,0.23622
490,5.0,0.096457
1258,5.0,0.043307
181,5.0,0.996063
1,5.0,0.887795
121,5.0,0.84252
174,5.0,0.824803
