In [1]:
import os
import numpy as np
import random
import scipy.sparse as sparse
from scipy.sparse import *
from sklearn.utils import shuffle
import pandas as pd
from math import ceil
from tqdm import trange
from sklearn.metrics import *
import sys
import pickle
from itertools import islice

In [3]:
datasets = ['Ciao, filmtrust, Lthing, Epinions']
file_dir = '/Users/payoj/Documents/Spring 2019/Recommender Systems/project/data/'+datasets[0]
# file_path = os.path.join(file_dir, 'u.data')

# we will not be using the timestamp column
pos_file = open(file_dir+"/positive_feedback_dataframe.pkl",'rb')
soc_file = open(file_dir+"/social_positive_feedback_dataframe.pkl",'rb')
positive_df = pickle.load(pos_file)
social_positive_df = pickle.load(soc_file)

In [4]:
print('data dimension: \n', positive_df.shape, social_positive_df.shape)

data dimension: 
 (34130, 3) (408247, 2)


In [3]:
rstate = np.random.RandomState(20190415)
# ratings = df[:660000]
# msk = rstate.rand(len(ratings)) <= 0.8
# train = ratings[msk]
# test = ratings[~msk]

In [4]:
# file_dir = '/Users/poorwa.hirve/Documents/CU/RecSys/social-network-recommender-system/ml-100k'
# file_path = os.path.join(file_dir, 'u.data')

# # we will not be using the timestamp column
# names = ['user_id', 'item_id', 'rating', 'timestamp']
# df = pd.read_csv(file_path, sep = '\t', names = names)
# df = df.drop(columns='timestamp')
# print('data dimension: \n', df.shape)
# df.head()

In [5]:
def create_matrix(data, users_col, items_col, ratings_col):
    """
    creates the sparse user-item interaction matrix,
    if the data is not in the format where the interaction only
    contains the positive items (indicated by 1), then use the 
    threshold parameter to determine which items are considered positive
    
    Parameters
    ----------
    data : DataFrame
        implicit rating data

    users_col : str
        user column name

    items_col : str
        item column name
    
    ratings_col : str
        implicit rating column name

    Returns
    -------
    ratings : scipy sparse csr_matrix, shape [n_users, n_items]
        user/item ratings matrix

    data : DataFrame
        implict rating data that retains only the positive feedback
        (if specified to do so)
    """
    for col in (items_col, users_col, ratings_col):
        data[col] = data[col].astype('category')

    ratings = csr_matrix((data[ratings_col],
                          (data[users_col].cat.codes, data[items_col].cat.codes)))
    ratings.eliminate_zeros()
    return ratings, data

In [6]:
# X = sparse.load_npz("yourmatrix.npz")

In [7]:
items_col = 'item'
users_col = 'user'
ratings_col = 'rating'
postive_X, positive_df = create_matrix(positive_df, users_col, items_col, ratings_col)

In [9]:
positive_df.head()

Unnamed: 0,user,item,rating
0,1,1,5
1,2,1,5
3,4,1,4
4,5,1,4
5,6,1,5


In [10]:
def create_train_test(ratings, test_size = 0.2, seed = 1234):
    """
    split the user-item interactions matrix into train and test set
    by removing some of the interactions from every user and pretend
    that we never saw them
    
    Parameters
    ----------
    ratings : scipy sparse csr_matrix, shape [n_users, n_items]
        The user-item interactions matrix
    
    test_size : float between 0.0 and 1.0, default 0.2
        Proportion of the user-item interactions for each user
        in the dataset to move to the test set; e.g. if set to 0.2
        and a user has 10 interactions, then 2 will be moved to the
        test set
    
    seed : int, default 1234
        Seed for reproducible random splitting the 
        data into train/test set
    
    Returns
    ------- 
    train : scipy sparse csr_matrix, shape [n_users, n_items]
        Training set
    
    test : scipy sparse csr_matrix, shape [n_users, n_items]
        Test set
    """
    assert test_size < 1.0 and test_size > 0.0

    # Dictionary Of Keys based sparse matrix is more efficient
    # for constructing sparse matrices incrementally compared with csr_matrix
    
    train = ratings.copy().tocsr()
    test = csr_matrix(train.shape)
    
    # for all the users assign randomly chosen interactions
    # to the test and assign those interactions to zero in the training;
    # when computing the interactions to go into the test set, 
    # remember to round up the numbers (e.g. a user has 4 ratings, if the
    # test_size is 0.2, then 0.8 ratings will go to test, thus we need to
    # round up to ensure the test set gets at least 1 rating)
    
    rstate = np.random.RandomState(seed)
    for u in range(ratings.shape[0]):
        split_index = ratings[u].indices
        n_splits = ceil(test_size * split_index.shape[0])
        test_index = rstate.choice(split_index, size = n_splits, replace = False)
        test[u, test_index] = ratings[u, test_index]
        train[u, test_index] = 0
    
    train, test = train.tocsr(), test.tocsr()
    return train, test

In [11]:
X_train, X_test = create_train_test(postive_X, test_size = 0.1, seed = 20191004)



In [13]:
print (X_train.shape, X_test.shape)

(2104, 10768) (2104, 10768)


In [14]:
print (X_train.getnnz(), X_test.getnnz())

33944 4271


In [16]:
class SBPR2:
    """
    Bayesian Personalized Ranking (BPR) for implicit feedback data

    Parameters
    ----------
    learning_rate : float, default 0.01
        learning rate for gradient descent

    n_factors : int, default 20
        Number/dimension of user and item latent factors

    n_iters : int, default 15
        Number of iterations to train the algorithm
        
    batch_size : int, default 1000
        batch size for batch gradient descent, the original paper
        uses stochastic gradient descent (i.e., batch size of 1),
        but this can make the training unstable (very sensitive to
        learning rate)

    reg : int, default 0.01
        Regularization term for the user and item latent factors

    seed : int, default 1234
        Seed for the randomly initialized user, item latent factors

    verbose : bool, default True
        Whether to print progress bar while training

    Attributes
    ----------
    user_factors : 2d ndarray, shape [n_users, n_factors]
        User latent factors learnt

    item_factors : 2d ndarray, shape [n_items, n_factors]
        Item latent factors learnt

    References
    ----------
    S. Rendle, C. Freudenthaler, Z. Gantner, L. Schmidt-Thieme 
    Bayesian Personalized Ranking from Implicit Feedback
    - https://arxiv.org/abs/1205.2618
    """
    def __init__(self, learning_rate = 0.01, n_factors = 15, n_iters = 10, 
                 batch_size = 1000, social_coefficient = 1, reg_u = 0.015, reg_i = 0.01, reg_k = 0.01, reg_j = 0.01, seed = 1234, verbose = True):
        self.reg_u = reg_u
        self.reg_i = reg_i
        self.reg_k = reg_k
        self.reg_j = reg_j
        self.seed = seed
        self.verbose = verbose
        self.n_iters = n_iters
        self.n_factors = n_factors
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.s_uk = social_coefficient
        
        # to avoid re-computation at predict
        self._prediction = None
        
    def fit(self, ratings):
        """
        Parameters
        ----------
        ratings : scipy sparse csr_matrix, shape [n_users, n_items]
            sparse matrix of user-item interactions
        """
        indptr = ratings.indptr
        indices = ratings.indices
        n_users, n_items = ratings.shape
        
        # ensure batch size makes sense, since the algorithm involves
        # for each step randomly sample a user, thus the batch size
        # should be smaller than the total number of users or else
        # we would be sampling the user with replacement
        batch_size = self.batch_size
        if n_users < batch_size:
            batch_size = n_users
            sys.stderr.write('WARNING: Batch size is greater than number of users,'
                             'switching to a batch size of {}\n'.format(n_users))

        batch_iters = n_users // batch_size
        
        # initialize random weights
        rstate = np.random.RandomState(self.seed)
        self.user_factors = rstate.normal(size = (n_users, self.n_factors))
        self.item_factors = rstate.normal(size = (n_items, self.n_factors))
        
        # progress bar for training iteration if verbose is turned on
        loop = range(self.n_iters)
        if self.verbose:
            loop = trange(self.n_iters, desc = self.__class__.__name__)
        
        for _ in loop:
            for _ in range(batch_iters):
                sampled = self._sample(n_users, n_items, indices, indptr)
                sampled_users, sampled_pos_items, sampled_soc_items, sampled_neg_items = sampled
                self._update(sampled_users, sampled_pos_items, sampled_soc_items, sampled_neg_items)

        return self
    
    def _sample(self, n_users, n_items, indices, indptr):
        """sample batches of random triplets u, i, j"""
        sampled_pos_items = np.zeros(self.batch_size, dtype = np.int)
        sampled_soc_items = np.zeros(self.batch_size, dtype = np.int)
        sampled_neg_items = np.zeros(self.batch_size, dtype = np.int)
        
        sampled_users = np.random.choice(
            n_users, size = self.batch_size, replace = False)
        
        for idx, user in enumerate(sampled_users):
            pos_items = indices[indptr[user]:indptr[user + 1]]
            pos_item = np.random.choice(pos_items)
            
            neg_item = np.random.choice(n_items)
            while neg_item in pos_items:
                neg_item = np.random.choice(n_items)
            
#             Get a social feedback item from negative item\

            sampled_pos_items[idx] = pos_item
            sampled_neg_items[idx] = neg_item
            sampled_soc_items[idx] = soc_item
            
        return sampled_users, sampled_pos_items, sampled_soc_items, sampled_neg_items
                
    def _update(self, u, i, k ,j):
        """
        update according to the bootstrapped user u, 
        positive item i and negative item j
        """
        user_u = self.user_factors[u]
        item_i = self.item_factors[i]
        item_j = self.item_factors[j]
        item_k = self.item_factors[k]
        
        # decompose the estimator, compute the difference between
        # the score of the (positive and social items) and (social and negative items) ; 

        r_ukj = np.sum(user_u * (item_k - item_j), axis = 1)
        sigmoid_ukj = np.exp(-r_ukj) / (1.0 + np.exp(-r_ukj))     # derivation of 1/(1+exp(-x))
        
        r_uik = np.sum(user_u*(item_i - item_k), axis = 1)/(1 + self.s_uk)
        sigmoid_uik = np.exp(-r_uik) / (1.0 + np.exp(-r_uik))     # derivation of 1/(1+exp(-x))
        
        # repeat the 1 dimension sigmoid n_factors times so
        # the dimension will match when doing the update
        
        sigmoid_uik_tiled = np.tile(sigmoid_uik, (self.n_factors, 1)).T
        sigmoid_ukj_tiled = np.tile(sigmoid_ukj, (self.n_factors, 1)).T
        

        # update using gradient descent

        grad_u = sigmoid_uik_tiled * ((item_k - item_i)/(1 + self.s_uk)) + sigmoid_ukj_tiled * (item_j - item_k) + self.reg_u * user_u
        grad_i = sigmoid_uik_tiled * (-user_u)/(1 + self.s_uk) + self.reg_i * item_i
        grad_k = sigmoid_uik_tiled * (user_u/(1 + self.s_uk)) + (sigmoid_ukj_tiled * -user_u) + self.reg_k * item_k
        grad_j = (sigmoid_ukj_tiled * user_u) + self.reg_j * item_j
        
        self.user_factors[u] -= self.learning_rate * grad_u
        self.item_factors[i] -= self.learning_rate * grad_i
        self.item_factors[k] -= self.learning_rate * grad_k
        self.item_factors[j] -= self.learning_rate * grad_j
        
        return self

    def predict(self):
        """
        Obtain the predicted ratings for every users and items
        by doing a dot product of the learnt user and item vectors.
        The result will be cached to avoid re-computing it every time
        we call predict, thus there will only be an overhead the first
        time we call it. Note, ideally you probably don't need to compute
        this as it returns a dense matrix and may take up huge amounts of
        memory for large datasets
        """
        if self._prediction is None:
            self._prediction = self.user_factors.dot(self.item_factors.T)

        return self._prediction

    def _predict_user(self, user):
        """
        returns the predicted ratings for the specified user,
        this is mainly used in computing evaluation metric
        """
        user_pred = self.user_factors[user].dot(self.item_factors.T)
        return user_pred
    
    def recommend(self, data, N = 5):
        """
        Returns the top N ranked items for given user id,
        excluding the ones that the user already liked
        
        Parameters
        ----------
        ratings : scipy sparse csr_matrix, shape [n_users, n_items]
            sparse matrix of user-item interactions 
        
        N : int, default 5
            top-N similar items' N
        
        Returns
        -------
        recommendation : 2d ndarray, shape [number of users, N]
            each row is the top-N ranked item for each query user
        """
        n_users = data.shape[0]
        recommendation = np.zeros((n_users, N))
        scores = np.zeros((n_users, N))
        users = []
        ranks = []
        for user in range(n_users):
            users.append([user+1]*N)
            ranks.append([i for i in range(1,N+1)])
            topN_items, topN_scores = self.recommend_user(data, user, N)
            recommendation[user], scores[user] = topN_items, topN_scores

        return recommendation, scores, users, ranks
    
    def get_item_ratings(self, data, u):
        
        if u not in self.item_ratings:
            items = data[u].indices
            ratings = data[u].data
            
            self.item_ratings[u] = []
            for i in range(len(items)):
                self.item_ratings[u].append((items[i], ratings[i]))
                
        return self.item_ratings[u]
        

    def recommend_user(self, data, u, N, validation = True):
        """the top-N ranked items for a given user"""
        scores = self._predict_user(u)

        # compute the top N items, removing the items that the user already liked
        # from the result and ensure that we don't get out of bounds error when 
        # we ask for more recommendations than that are available
        liked = set(data[u].indices)
        count = N + len(liked)
        if count < scores.shape[0]:

            # when trying to obtain the top-N indices from the score,
            # using argpartition to retrieve the top-N indices in 
            # unsorted order and then sort them will be faster than doing
            # straight up argort on the entire score
            # http://stackoverflow.com/questions/42184499/cannot-understand-numpy-argpartition-output
            ids = np.argpartition(scores, -count)[-count:]
            best_ids = np.argsort(scores[ids])[::-1]
            best = ids[best_ids]
            best_scores = scores[best]
        else:
            best = np.argsort(scores)[::-1]
            best_scores = scores[best]    

        topN_items = []
        topN_scores = []
        for i in range(len(best)):
            if best[i] not in liked:
                topN_items.append(best[i])
                topN_scores.append(best_scores[i])
                
        topN_items = list(islice((item for item in topN_items), N))
        topN_scores = list(islice((score for score in topN_scores), N))
        return topN_items, topN_scores

#     def recommend(self, ratings, N = 5):
#         """
#         Returns the top N ranked items for given user id,
#         excluding the ones that the user already liked
        
#         Parameters
#         ----------
#         ratings : scipy sparse csr_matrix, shape [n_users, n_items]
#             sparse matrix of user-item interactions 
        
#         N : int, default 5
#             top-N similar items' N
        
#         Returns
#         -------
#         recommendation : 2d ndarray, shape [number of users, N]
#             each row is the top-N ranked item for each query user
#         """
#         n_users = ratings.shape[0]
#         recommendation = np.zeros((n_users, N))
#         scores = np.zeros((n_users, N))
#         users = []
#         ranks = []
#         for user in range(n_users):
#             users.append([user+1]*N)
#             ranks.append([i for i in range(1,N+1)])
#             topN_items, topN_scores = self._recommend_user(ratings, user, N)
#             recommendation[user], scores[user] = topN_items, topN_scores

#         return recommendation, scores, users, ranks

#     def _recommend_user(self, ratings, user, N):
#         """the top-N ranked items for a given user"""
#         scores = self._predict_user(user)

#         # compute the top N items, removing the items that the user already liked
#         # from the result and ensure that we don't get out of bounds error when 
#         # we ask for more recommendations than that are available
#         liked = set(ratings[user].indices)
#         count = N + len(liked)
#         if count < scores.shape[0]:

#             # when trying to obtain the top-N indices from the score,
#             # using argpartition to retrieve the top-N indices in 
#             # unsorted order and then sort them will be faster than doing
#             # straight up argort on the entire score
#             # http://stackoverflow.com/questions/42184499/cannot-understand-numpy-argpartition-output
#             ids = np.argpartition(scores, -count)[-count:]
#             best_ids = np.argsort(scores[ids])[::-1]
#             best = ids[best_ids]
#             print (scores[best[0]])
#             best_scores = scores[best]
#         else:
#             best = np.argsort(scores)[::-1]
#             print (scores[best[0]])
#             best_scores = np.argsort(scores)[::-1]

#         topN_items = []
#         topN_scores = []
#         for i in range(len(best)):
#             if best[i] not in liked:
#                 topN_items.append(best[i])
#                 topN_scores.append(best_scores[i])
                
#         topN_items = list(islice((item for item in topN_items), N))
#         topN_scores = list(islice((score for score in topN_scores), N))
#         return topN_items, topN_scores

In [17]:
# parameters were randomly chosen
sbpr_params = {'reg_u': 0.015,
               'reg_i': 0.025,
               'reg_k':0.025,
               'reg_j':0.025,
               'learning_rate': 0.001,
               'n_iters': 1,
               'n_factors': 10,
               'batch_size': 1}

sbpr = SBPR2(**sbpr_params)
sbpr.fit(X_train)

BPR: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:11<00:00, 11.40s/it]


<__main__.BPR at 0x11382ec18>

In [18]:
def auc_score(model, ratings):
    """
    computes area under the ROC curve (AUC).
    The full name should probably be mean
    auc score as it is computing the auc
    for every user's prediction and actual
    interaction and taking the average for
    all users
    
    Parameters
    ----------
    model : BPR instance
        Trained BPR model
        
    ratings : scipy sparse csr_matrix, shape [n_users, n_items]
        sparse matrix of user-item interactions
    
    Returns
    -------
    auc : float 0.0 ~ 1.0
    """
    auc = 0.0
    
    n_users, n_items = ratings.shape
    for user, row in enumerate(ratings):
        y_pred = model._predict_user(user)
        y_true = np.zeros(n_items)
        y_true[row.indices] = 1
        auc += roc_auc_score(y_true, y_pred)
    auc /= n_users
    return auc

In [19]:
# print(auc_score(bpr, X_test))

In [21]:
recommendation, scores, users, ranks = bpr.recommend(X_test, N = 5)

In [28]:
print (len(recommendation))

24174


In [29]:
print (len(scores))

24174


In [30]:
print (len(users))

24174


In [31]:
print (len(ranks))

24174


In [22]:
flatten = lambda l: [item for sublist in l for item in sublist]

In [34]:
df_test = pd.DataFrame({'user': flatten(users), 'item': flatten(recommendation), 'score': flatten(scores), 'rank': flatten(ranks)})


TypeError: only size-1 arrays can be converted to Python scalars

In [35]:
df_test['item'] = df_test['item'].astype(int)

In [36]:
df_test.head()

Unnamed: 0,user,item,score,rank
0,1,69963,14.722504,1
1,1,31660,14.636142,2
2,1,58681,14.338766,3
3,1,93133,14.136444,4
4,1,105175,14.02397,5


In [37]:
test_data = pd.DataFrame(columns = ['user_id', 'item_id', 'rating'])


In [38]:
X_test_np = X_test.toarray()

In [39]:
users = [i+1 for i in range(X_test_np.shape[0])]

In [40]:
items = [i+1 for i in range(X_test_np.shape[1])]

In [41]:
print (len(users), len(items))

24174 111869


In [None]:
f = open('test_data.txt', 'w+')
for user in users:
    for item in items:
        line = str(user) + ' ' + str(item) + ' ' + str(X_test_np[user-1][item-1]) + '\n'
f.close()

In [None]:
test_data['rating'] = test_ratings
test_data['user'] = users
test_data['item'] = items

In [None]:
test_data.head()

In [None]:
len(df_test['item'])

In [None]:
len(test_data['rating'])

In [None]:
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, als, item_knn as knn
from lenskit import topn

In [None]:
df.columns = ['user', 'item', 'rating']
df_test.columns = ['user', 'item', 'score', 'rank']
df_test['Algorithm'] = ['BPR']*len(df_test['item'])

In [None]:
test_data.head()

In [None]:
df_test.head()

In [None]:
df.head()

In [None]:
df['rating'] = np.array(df['rating']).astype(float)

In [None]:
ndcg = topn.ndcg(df_test, df)

In [None]:
ndcg

In [None]:
recall_at_5 = topn.recall(df_test, test_data)

In [None]:
recall_at_5

In [None]:
precision_at_5 = topn.precision(df_test, test_data)
precision_at_5

In [None]:
test_data['item']