In [None]:
import numpy as np
import random
import csv


In [None]:
class COFISET:
    """
    CoFiSet recommender: learns user/item latent factors and item biases
    from pairwise preferences between item *sets*, then scores every item
    for every user as ``pu[u] . qi[i] + bi[i]``.

    Parameters:
    --------------
    class_boundary : int
        ratings strictly greater than class_boundary mark an item as
        "observed" (liked) by the user; ratings in [1, class_boundary]
        are ignored, so those items stay in the user's unobserved pool
    p_size : int
        size of set P (observed item subset sampled per step), must be >= 1
    a_size : int
        size of set A (unobserved item subset sampled per step), must be >= 1
    n_factors : int
        Number of latent factors
    gamma : float
        learning rate
    alpha_u : float
        regularisation constant for user factors, hyperparameter
    alpha_v : float
        regularisation constant for item factors, hyperparameter
    beta_v : float
        regularisation constant for item biases, hyperparameter
    n_users : int
        number of users (ids in the data file run from 1 to n_users);
        defaults to 943, the MovieLens ml-100k value
    n_items : int
        number of items (ids in the data file run from 1 to n_items);
        defaults to 1682, the MovieLens ml-100k value

    Attributes set by ``fit``:
    --------------
    observed_item_set : dict mapping 0-based user id -> set of 0-based item ids
    training_item_set : set of 0-based item ids observed by at least one user
    bi : item bias vector of length n_items
    """

    def __init__(self, class_boundary=3, p_size=4, a_size=2, n_factors=10,
                 gamma=0.01, alpha_u=0.01, alpha_v=0.01, beta_v=0.01,
                 n_users=943, n_items=1682):
        # The per-set averages divide by these sizes, so zero is not usable.
        if p_size < 1 or a_size < 1:
            raise ValueError("p_size and a_size must both be at least 1")

        self.class_boundary = class_boundary
        self.p_size = p_size
        self.a_size = a_size
        self.n_factors = n_factors
        self.n_users = n_users
        self.n_items = n_items
        self.gamma = gamma
        self.alpha_u = alpha_u
        self.alpha_v = alpha_v
        self.beta_v = beta_v

        # Small random values centred on zero for the user (pu) and item (qi)
        # latent factor matrices; the item bias vector is created in fit().
        self.pu = (np.random.rand(self.n_users, self.n_factors) - 0.5) / 150
        self.qi = (np.random.rand(self.n_items, self.n_factors) - 0.5) / 150

    def fit(self, filename, T=10**4):
        """
        Train the model with stochastic gradient descent.

        Runs ``n_users`` epochs of ``T`` steps each. Every step draws one
        random user, a random subset P of that user's observed items and a
        random subset A of the user's unobserved training items, and takes
        one gradient step on -ln(sigmoid(avg_score(P) - avg_score(A))).
        Users without enough items to fill P or A are skipped.

        Parameters:
        --------------
        filename : str
            path of the training ratings file (see ``load_data``)
        T : int
            number of SGD steps per epoch
        """
        self.observed_item_set, self.training_item_set, self.bi = self.load_data(filename)

        # Initialise each item's bias as the fraction of users who observed
        # it, centred so that the biases average to zero.
        self.bi /= self.n_users
        self.bi -= self.bi.mean()

        pools = self._build_sampling_pools()

        for t1 in range(self.n_users):
            for _ in range(T):
                u = random.randrange(0, self.n_users)  # randomly choose a user

                pool = pools.get(u)
                if pool is None:
                    continue  # user has too few observed/unobserved items
                seen_items, unseen_items = pool

                A = random.sample(unseen_items, self.a_size)
                P = random.sample(seen_items, self.p_size)

                user_vec = self.pu[u]  # view; only modified by the last update

                # Summed latent vectors and biases of both item subsets.
                Vp = self.qi[P].sum(axis=0)
                Va = self.qi[A].sum(axis=0)
                rating_u_p_avg = (user_vec.dot(Vp) + self.bi[P].sum()) / self.p_size
                rating_u_a_avg = (user_vec.dot(Va) + self.bi[A].sum()) / self.a_size

                # Derivative of the loss w.r.t. the score gap R_{u,P,A}:
                # -sigmoid(-R_{u,P,A})
                loss = -1 / (1 + np.exp(rating_u_p_avg - rating_u_a_avg))

                Vp_avg = Vp / self.p_size
                Va_avg = Va / self.a_size

                # Item updates come first so they use the user vector from
                # before this step. P and A hold distinct indices, so the
                # fancy-indexed in-place updates touch each row exactly once.
                self.qi[P] -= self.gamma * (loss / self.p_size * user_vec + self.alpha_v * self.qi[P])
                self.bi[P] -= self.gamma * (loss / self.p_size + self.beta_v * self.bi[P])

                self.qi[A] -= self.gamma * (-loss / self.a_size * user_vec + self.alpha_v * self.qi[A])
                self.bi[A] -= self.gamma * (-loss / self.a_size + self.beta_v * self.bi[A])

                # The user update uses the item averages from before the
                # item updates above.
                self.pu[u] -= self.gamma * (loss * (Vp_avg - Va_avg) + self.alpha_u * user_vec)
            print("Epoch - {}".format(t1+1),end=" ")

    def _build_sampling_pools(self):
        """
        Return {user: (observed item list, unobserved item list)} for every
        user that has at least p_size observed and a_size unobserved items.

        random.sample() requires a sequence (sets are rejected since
        Python 3.11), and the pools do not change during training, so they
        are built once here instead of on every SGD step. Sorting makes the
        draws reproducible for a fixed random seed.
        """
        pools = {}
        for user, seen in self.observed_item_set.items():
            unseen = self.training_item_set - seen
            if len(seen) >= self.p_size and len(unseen) >= self.a_size:
                pools[user] = (sorted(seen), sorted(unseen))
        return pools

    def load_data(self, filename):
        """
        Read a comma-separated ratings file with rows of the form
        ``user_id,item_id,rating,timestamp`` (ids are 1-based in the file).

        Only rows whose rating is greater than ``class_boundary`` are kept.
        Blank rows are skipped.

        Returns:
        --------------
        observed_item_set : dict
            0-based user id -> set of 0-based item ids the user observed
        total_item_set : set
            0-based item ids observed by at least one user
        bi : numpy.ndarray
            for each item, the number of kept rows that mention it
        """
        observed_item_set = {}
        total_item_set = set()
        bi = np.zeros(self.n_items, float)

        # newline="" is what the csv module asks for when given a file object.
        with open(filename, newline="") as csv_file:
            for row in csv.reader(csv_file, delimiter=','):
                if not row:
                    continue
                # Shift ids to 0-based so they can index pu, qi and bi.
                user = int(row[0]) - 1
                item = int(row[1]) - 1
                rating = int(row[2])
                if rating > self.class_boundary:
                    bi[item] += 1
                    observed_item_set.setdefault(user, set()).add(item)
                    total_item_set.add(item)

        return observed_item_set, total_item_set, bi
        
def evaluate(self, filename):
    """
    Print ranking metrics of a fitted COFISET-like model on a test file.

    Every item is scored as ``pu[u] . qi[i] + bi[i]``. For each user that
    has at least one observed item in the test file, the user's training
    items are pushed to the bottom of the ranking and the following are
    accumulated, then averaged over the test users and printed:

    * Pre@5  - fraction of the top 5 items that are test items
    * NDCG@5 - DCG of the top 5, normalised as described in the body
    * MRR    - reciprocal rank of the first test item
    * ARP    - mean rank of the test items relative to the number of
               training items the user has not observed
    * AUC    - fraction of (test item, other unseen item) pairs that are
               ranked in the right order

    Parameters:
    --------------
    self : object
        fitted model providing pu, qi, bi, load_data, observed_item_set,
        training_item_set, alpha_u, alpha_v and beta_v
    filename : str
        path of the test ratings file, read through ``self.load_data``

    Returns nothing; the metrics are only printed.

    Raises ValueError when the test file yields no observed items.
    """
    top_k = 5
    predicted = self.pu.dot(self.qi.transpose()) + self.bi
    n_items = predicted.shape[1]

    test_observed, _, _ = self.load_data(filename)
    test_user_count = len(test_observed)
    if test_user_count == 0:
        raise ValueError("no observed test items found in {!r}".format(filename))

    n_train_items = len(self.training_item_set)

    precision = 0
    NDCG = 0.0
    # NOTE(review): NDCG is normalised by the largest per-user DCG seen
    # (floored at 1), not by each user's ideal DCG. Kept as in the original;
    # confirm this is the intended definition.
    max_dcg = 1
    MRR = 0.0
    ARP = 0.0
    AUC = 0.0

    for user, relevant in test_observed.items():
        # A test user may have no observed items in the training data.
        train_seen = self.observed_item_set.get(user, set())

        # Work on a copy so the shared score matrix is left untouched, and
        # mask with -inf: scores can be negative, so masking with 0 would
        # still rank training items above many unseen items.
        scores = predicted[user].copy()
        if train_seen:
            scores[list(train_seen)] = -np.inf
        rank_list = np.argsort(scores)[::-1].tolist()  # best item first

        dcg = 0.0
        for position, item in enumerate(rank_list[:top_k]):
            if item in relevant:
                precision += 1
                dcg += 1 / np.log2(position + 2)  # position is 0-based
        NDCG += dcg
        max_dcg = max(max_dcg, dcg)

        hits_so_far = 0      # test items met so far while walking down the ranking
        rank_sum = 0         # sum of the ranks of the test items
        first_hit_rank = 0   # rank of the best-placed test item
        ordered_pairs = 0    # (test item, other unseen item) pairs in the right order
        for rank, item in enumerate(rank_list, start=1):
            if item in relevant:
                hits_so_far += 1
                rank_sum += rank
                if first_hit_rank == 0:
                    first_hit_rank = rank
            elif item not in train_seen:
                ordered_pairs += hits_so_far

        if first_hit_rank:
            MRR += 1 / first_hit_rank

        auc_pairs = len(relevant) * (n_items - len(relevant) - len(train_seen))
        if auc_pairs > 0:
            AUC += ordered_pairs / auc_pairs

        arp_scale = len(relevant) * (n_train_items - len(train_seen))
        if arp_scale > 0:
            ARP += rank_sum / arp_scale

    precision /= (top_k * test_user_count)
    NDCG /= (max_dcg * test_user_count)
    MRR /= test_user_count
    ARP /= test_user_count
    AUC /= test_user_count
    print("d \talpha_u | alpha_v | beta_v |   Pre@5   |   NDCG@5|    MRR    |   ARP   |   AUC   |")
    print("COFISET {}\t| {}\t  | {}   |  {:.4f}   |   {:.4f}|  {:.4f}   |  {:.4f} |  {:.4f} |".format(self.alpha_u,self.alpha_v,
                                                                                        self.beta_v,precision,NDCG,MRR,ARP,AUC))

In [None]:
%%time
# Build a model with the default hyperparameters and fit it on the ratings file.
cofiset = COFISET()
cofiset.fit(r"/kaggle/input/u.base")

In [None]:
def evaluate(self, filename):
    """
    Print ranking metrics of a fitted COFISET-like model on a test file.

    Every item is scored as ``pu[u] . qi[i] + bi[i]``. For each user that
    has at least one observed item in the test file, the user's training
    items are pushed to the bottom of the ranking and the following are
    accumulated, then averaged over the test users and printed:

    * Pre@5  - fraction of the top 5 items that are test items
    * NDCG@5 - DCG of the top 5, normalised as described in the body
    * MRR    - reciprocal rank of the first test item
    * ARP    - mean rank of the test items relative to the number of
               training items the user has not observed
    * AUC    - fraction of (test item, other unseen item) pairs that are
               ranked in the right order

    Parameters:
    --------------
    self : object
        fitted model providing pu, qi, bi, load_data, observed_item_set,
        training_item_set, alpha_u, alpha_v and beta_v
    filename : str
        path of the test ratings file, read through ``self.load_data``

    Returns nothing; the metrics are only printed.

    Raises ValueError when the test file yields no observed items.
    """
    top_k = 5
    predicted = self.pu.dot(self.qi.transpose()) + self.bi
    n_items = predicted.shape[1]

    test_observed, _, _ = self.load_data(filename)
    test_user_count = len(test_observed)
    if test_user_count == 0:
        raise ValueError("no observed test items found in {!r}".format(filename))

    n_train_items = len(self.training_item_set)

    precision = 0
    NDCG = 0.0
    # NOTE(review): NDCG is normalised by the largest per-user DCG seen
    # (floored at 1), not by each user's ideal DCG. Kept as in the original;
    # confirm this is the intended definition.
    max_dcg = 1
    MRR = 0.0
    ARP = 0.0
    AUC = 0.0

    for user, relevant in test_observed.items():
        # A test user may have no observed items in the training data.
        train_seen = self.observed_item_set.get(user, set())

        # Work on a copy so the shared score matrix is left untouched, and
        # mask with -inf: scores can be negative, so masking with 0 would
        # still rank training items above many unseen items.
        scores = predicted[user].copy()
        if train_seen:
            scores[list(train_seen)] = -np.inf
        rank_list = np.argsort(scores)[::-1].tolist()  # best item first

        dcg = 0.0
        for position, item in enumerate(rank_list[:top_k]):
            if item in relevant:
                precision += 1
                dcg += 1 / np.log2(position + 2)  # position is 0-based
        NDCG += dcg
        max_dcg = max(max_dcg, dcg)

        hits_so_far = 0      # test items met so far while walking down the ranking
        rank_sum = 0         # sum of the ranks of the test items
        first_hit_rank = 0   # rank of the best-placed test item
        ordered_pairs = 0    # (test item, other unseen item) pairs in the right order
        for rank, item in enumerate(rank_list, start=1):
            if item in relevant:
                hits_so_far += 1
                rank_sum += rank
                if first_hit_rank == 0:
                    first_hit_rank = rank
            elif item not in train_seen:
                ordered_pairs += hits_so_far

        if first_hit_rank:
            MRR += 1 / first_hit_rank

        auc_pairs = len(relevant) * (n_items - len(relevant) - len(train_seen))
        if auc_pairs > 0:
            AUC += ordered_pairs / auc_pairs

        arp_scale = len(relevant) * (n_train_items - len(train_seen))
        if arp_scale > 0:
            ARP += rank_sum / arp_scale

    precision /= (top_k * test_user_count)
    NDCG /= (max_dcg * test_user_count)
    MRR /= test_user_count
    ARP /= test_user_count
    AUC /= test_user_count
    print("d \talpha_u | alpha_v | beta_v |   Pre@5   |   NDCG@5|    MRR    |   ARP   |   AUC   |")
    print("COFISET {}\t| {}\t  | {}   |  {:.4f}   |   {:.4f}|  {:.4f}   |  {:.4f} |  {:.4f} |".format(self.alpha_u,self.alpha_v,
                                                                                        self.beta_v,precision,NDCG,MRR,ARP,AUC))

In [None]:
%%time
# Print the ranking metrics of the fitted model for the test ratings file.
evaluate(cofiset,r"/kaggle/input/u.test")