**Imports and artifact file paths**

In [None]:
# Standard library
import zipfile

# Third-party
import numpy as np
import requests
from google.colab import drive

# NOTE: the former `!pip install --quiet zipfile36` step was removed. That
# package is only importable as `zipfile36`, while this notebook imports the
# standard-library `zipfile` module, so the install had no effect.

# Mount Google Drive so the experiment artifacts survive the Colab session.
drive.mount('/content/drive')

# Directory (on the mounted Drive) where all artifacts are written.
path = '/content/drive/My Drive/Colab Notebooks/'
# Per-experiment RMSE histories.
hist_file = path + 'hist.npy'
# Hyper-parameter combination used by each experiment.
params_file = path + 'params.npy'
# Final test RMSE of each experiment.
hist_file_rmse = path + 'hist_rmse.npy'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**load data**

In [None]:

import os  # only needed here, for the download-cache check

url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
zip_name = 'ml-1m.zip'
data_dir = 'ml-1m/'

# Download the archive only when it is not already on disk, so that
# "Restart & Run All" does not re-fetch the dataset every time.
if not os.path.exists(zip_name):
    r = requests.get(url, allow_redirects=True, timeout=60)
    # Fail loudly on HTTP errors instead of writing an error page to disk.
    r.raise_for_status()
    with open(zip_name, 'wb') as zip_file:
        zip_file.write(r.content)

with zipfile.ZipFile(zip_name, "r") as zip_ref:
    zip_ref.extractall()

num_items = 3952
num_users = 6040
train_ratio = 0.9
SEED = 42  # fixes the train/test split (and seeds NumPy's global RNG)

# Each line of ratings.dat has four "::"-separated fields; only the first
# three (user id, item id, rating) are used.
with open(data_dir + "ratings.dat") as fp:
    lines = fp.readlines()

# Parse the file once instead of once per matrix.
ratings_table = np.array(
    [[int(field) for field in line.split("::")[:3]] for line in lines],
    dtype=np.int64,
).reshape(-1, 3)

num_total_ratings = len(lines)
assert num_total_ratings > 0, "ratings.dat is empty"

# Ids in the file are 1-based; matrix indices are 0-based.
user_ids = ratings_table[:, 0] - 1
item_ids = ratings_table[:, 1] - 1
rating_values = ratings_table[:, 2]

# A 0 or out-of-range id would otherwise wrap around or raise an opaque
# IndexError during the assignments below.
assert user_ids.min() >= 0 and user_ids.max() < num_users, "user id out of range"
assert item_ids.min() >= 0 and item_ids.max() < num_items, "item id out of range"

# Random split of the individual ratings into train and test.
np.random.seed(SEED)
random_perm_idx = np.random.permutation(num_total_ratings)
num_train_ratings = int(num_total_ratings * train_ratio)
train_idx = random_perm_idx[:num_train_ratings]
test_idx = random_perm_idx[num_train_ratings:]
num_test_ratings = len(test_idx)
assert num_train_ratings + num_test_ratings == num_total_ratings

# Dense user x item matrices; 0 marks "no rating".
R = np.zeros((num_users, num_items))
train_R = np.zeros((num_users, num_items))
test_R = np.zeros((num_users, num_items))

R[user_ids, item_ids] = rating_values
train_R[user_ids[train_idx], item_ids[train_idx]] = rating_values[train_idx]
test_R[user_ids[test_idx], item_ids[test_idx]] = rating_values[test_idx]


**define rmse**

In [None]:
    def rmse(df_true,df_pred):
        """Root-mean-square error over the observed entries of ``df_true``.

        Only positions where ``df_true`` is non-zero (i.e. the user actually
        rated the item) take part in the computation; predictions for every
        other position are ignored. Predictions are used as given, without
        clipping to any lower or upper bound.

        Parameters
        ----------
        df_true : array-like
            True ratings, with 0 meaning "not rated".
        df_pred : array-like
            Predicted ratings, same shape as ``df_true``.

        Returns
        -------
        float
            The RMSE, or NaN when ``df_true`` has no non-zero entry.
        """
        df_true = np.asarray(df_true)
        df_pred = np.asarray(df_pred)

        observed = df_true != 0
        # Nothing to evaluate: report NaN instead of dividing by zero.
        if not observed.any():
            return float('nan')

        residuals = df_pred[observed] - df_true[observed]
        return np.sqrt(np.mean(np.square(residuals)))

**define matrix factorization**

In [None]:
import time
class MF():
    """Biased matrix factorization trained with stochastic gradient descent.

    A rating is predicted as
    ``global_bias + user_bias + item_bias + user_vector . item_vector``;
    with ``only_bias=True`` the dot-product term is left out.

    Parameters
    ----------
    df_train : 2-D numpy array
        Training ratings (users x items); 0 means "not rated".
    n_factors : int
        Number of latent factors per user / item.
    only_bias : bool
        When True, only the bias terms are trained and used for prediction.
    """

    def __init__(self,df_train,n_factors=10,only_bias=False):
        self.df_train = df_train
        self.n_factors = n_factors
        self.only_bias = only_bias

        # Latent vectors are drawn from a zero-mean normal distribution.
        n_users, n_items = self.df_train.shape
        self.users_latent = np.random.normal(scale=1./self.n_factors,
                                             size=(n_users, self.n_factors))
        self.items_latent = np.random.normal(scale=1./self.n_factors,
                                             size=(n_items, self.n_factors))

        # Coordinates of the observed ratings: for every k, the pair
        # (self.n_rows[k], self.n_cols[k]) is an observed (user, item) couple.
        self.n_rows, self.n_cols = df_train.nonzero()

        # Global bias = mean of the observed ratings; 0.0 when there are none
        # (avoids a 0/0 NaN that would poison every prediction).
        n_observed = np.count_nonzero(df_train)
        self.original_bias_global = np.sum(df_train)/n_observed if n_observed else 0.0

        self.initilize_params()

    def initilize_params(self):
        """Reset the bias terms. The latent factors are NOT reset here."""
        self.bias_global = self.original_bias_global   # fixed, not learned
        self.bias_rows   = np.zeros(self.df_train.shape[0])  # user biases
        self.bias_cols   = np.zeros(self.df_train.shape[1])  # item biases

    def fit(self,
            n_iterations=1,
            learning_rate=1e-1,
            regularization=1e-2,
            convergence=1e-5,
            error='RMSE',
            initilize_training=True,
            verbose=True,
            df_test=None):
        """Train the model with SGD and record per-epoch RMSE.

        Parameters
        ----------
        n_iterations : int
            Maximum number of epochs.
        learning_rate : float
            SGD step size.
        regularization : float
            L2 regularization weight.
        convergence : float
            Stop once the train RMSE changes by less than this between two
            consecutive epochs.
        error : str
            Stored on the instance; not otherwise used by this class.
        initilize_training : bool
            When True, the biases are reset before training.
        verbose : bool
            Print the train / test RMSE at every epoch.
        df_test : 2-D numpy array, optional
            Held-out ratings used for ``self.testHist``. When omitted, the
            notebook-level ``test_R`` is used (the previous behaviour), which
            raises ``NameError`` if that global does not exist.

        Notes
        -----
        ``self.history`` / ``self.testHist`` are recorded *before* each SGD
        epoch, so they do not include the state after the final epoch.
        ``self.fit_time`` holds the training duration in seconds.
        """
        self.n_iterations = n_iterations
        self.α = learning_rate
        self.λ = regularization
        self.ϵ = convergence
        self.error = error

        if df_test is None:
            df_test = test_R

        if initilize_training:
            self.initilize_params()

        self.history = []
        self.testHist = []

        self.fit_time = time.time()

        for current_iteration in range(self.n_iterations):
            # Build the dense prediction matrix once and score it against
            # both data sets.
            recommended = self.predict()
            self.history.append(rmse(self.df_train, recommended))
            self.testHist.append(rmse(df_test, recommended))

            if verbose:
                print('iteration: ',current_iteration,' total error:',self.history[-1], ' test error:', self.testHist[-1])

            # At least two history points are needed for the convergence test.
            if current_iteration!=0 and self.converging():
                if verbose:
                    print('converged...')
                break

            self.optim_GD()
        self.fit_time = time.time() - self.fit_time

    def converging(self):
        """Return True when the last two train-RMSE values differ by less than the tolerance."""
        return np.abs(self.history[-1] - self.history[-2]) < self.ϵ

    def optim_GD(self):
        """Run one SGD epoch: visit every observed rating once, in random order."""
        for i in np.random.permutation(len(self.n_rows)):
            user_index = self.n_rows[i]
            item_index = self.n_cols[i]
            error = self.error_for_couple(user_index, item_index)

            # Bias update first, then the latent-factor update, both driven
            # by the same pre-update error.
            self.step_rows(error, user_index, item_index)
            self.step_cols(error, user_index, item_index)

    def error_for_couple(self, user_index, item_index):
        """Return ``true rating - predicted rating`` for one (user, item) couple."""
        pred = self.bias_global + self.bias_rows[user_index] + self.bias_cols[item_index]
        if not self.only_bias:
            pred += self.users_latent[user_index, :].dot(self.items_latent[item_index, :])
        return self.df_train[user_index, item_index] - pred

    def step_rows(self, error, user_index, item_index):
        """Update the user bias and the item bias for one rating (despite the name, both are updated here)."""
        self.bias_rows[user_index] += self.α * (error - self.λ * self.bias_rows[user_index])
        self.bias_cols[item_index] += self.α * (error - self.λ * self.bias_cols[item_index])

    def step_cols(self, error, user_index, item_index):
        """Update the user and item latent vectors for one rating (no-op when ``only_bias`` is set)."""
        if self.only_bias:
            return

        # Snapshot the user vector so that both gradients are evaluated at the
        # same point; otherwise the item update would see the already-updated
        # user vector.
        user_vec = self.users_latent[user_index, :].copy()
        item_vec = self.items_latent[item_index, :]

        self.users_latent[user_index, :] += self.α * (error * item_vec - self.λ * user_vec)
        self.items_latent[item_index, :] += self.α * (error * user_vec - self.λ * item_vec)

    def predict(self,N=10):
        """Return the dense matrix of predicted ratings for every (user, item) couple.

        ``N`` is accepted for backward compatibility and is not used.
        """
        # Shape comes from the model's own training matrix rather than from a
        # notebook global.
        recommended = np.full(self.df_train.shape, self.bias_global, dtype=float)
        if not self.only_bias:
            recommended += np.matmul(self.users_latent, self.items_latent.T)
        recommended += self.bias_rows[:, None]   # user bias, broadcast over columns
        recommended += self.bias_cols            # item bias, broadcast over rows
        return recommended

    def get_rmse(self,test):
        """Return the RMSE of the current predictions on the observed entries of ``test``."""
        return rmse(test, self.predict())

    

**run the models**

In [None]:
import pandas as pd
from numpy import save
from itertools import product

# Hyper-parameter grid.
n_iterations   = [40] # just one value due to computational power
convergence    = [1e-5,1e-3]
latent_factors = [10, 20, 40]
regularization = [0.01, 0.1, 1]
learning_rate  = [0.001, 0.01]

result_columns = ['Latent_factors','Iterations','Learning_Rate','Regularization','Convergence','RMSE','train History','test History','Time']

res = []      # final test RMSE of every experiment
res1 = []     # [train history, test history] of every experiment
params = []   # hyper-parameters of every experiment, same order as res / res1
records = []  # one dict per experiment, turned into `results` at the end

# Same iteration order as nesting the loops n_iter > lf > lr > reg > conv.
for n_iter, lf, lr, reg, conv in product(n_iterations, latent_factors,
                                         learning_rate, regularization, convergence):
    params.append([n_iter, lf, lr, reg, conv])

    mf = MF(train_R,n_factors=lf, only_bias=False)

    start = time.time()
    mf.fit(n_iterations=n_iter,verbose=True ,learning_rate=lr,regularization=reg,convergence=conv)
    elapsed = time.time() - start

    # Evaluate once, after the last SGD epoch, and reuse the value.
    test_rmse = mf.get_rmse(test_R)

    records.append({'Latent_factors': lf,
                    'Iterations': n_iter,
                    'Learning_Rate': lr,
                    'Regularization': reg,
                    'Convergence': conv,
                    'RMSE': test_rmse,
                    'train History': mf.history,
                    'test History': mf.testHist,
                    'Time': elapsed})

    res.append(test_rmse)
    # Store the train AND the test history (previously the train history was
    # stored twice).
    res1.append([mf.history, mf.testHist])

    # Checkpoint after every experiment so partial results survive a
    # disconnected session. Histories differ in length when a run converges
    # early, so they must be stored as an explicit object array.
    save(hist_file_rmse, res)
    save(hist_file, np.array(res1, dtype=object))
    save(params_file, params)

# Build the summary table once, from the collected records.
results = pd.DataFrame(records, columns=result_columns)

res

iteration:  0  total error: 1.117416064283452  test error: 1.1186844479878695
iteration:  1  total error: 1.0044183133720912  test error: 1.0077671193028592
iteration:  2  total error: 0.9673431245225359  test error: 0.9716933920541572
iteration:  3  total error: 0.9486056333005153  test error: 0.9538216098637887
iteration:  4  total error: 0.9372407006729486  test error: 0.9431670650932265
iteration:  5  total error: 0.9295705752174868  test error: 0.9361195078965776
iteration:  6  total error: 0.9240110349585791  test error: 0.9311181634331914
iteration:  7  total error: 0.9197914241652572  test error: 0.9273886763544991
iteration:  8  total error: 0.9164722526456707  test error: 0.9244970743196314
iteration:  9  total error: 0.9137852457148077  test error: 0.9222208950756132
iteration:  10  total error: 0.9115689179013066  test error: 0.9203736726532028
iteration:  11  total error: 0.9097044963065741  test error: 0.9188723873349955
iteration:  12  total error: 0.908118157880583  tes

  arr = np.asanyarray(arr)


iteration:  0  total error: 1.117450977398988  test error: 1.1187332816951108
iteration:  1  total error: 1.006226518718138  test error: 1.0095771143810588
iteration:  2  total error: 0.9698686645005808  test error: 0.9741806440768945
iteration:  3  total error: 0.9514229639187533  test error: 0.9564998214709785
iteration:  4  total error: 0.9401681175944254  test error: 0.9458351532739471
iteration:  5  total error: 0.9326214712109463  test error: 0.9388287235801418
iteration:  6  total error: 0.9271586158402265  test error: 0.9337856468777503
iteration:  7  total error: 0.9230427075671417  test error: 0.9300354981874224
iteration:  8  total error: 0.919813075178748  test error: 0.9271083710417191
iteration:  9  total error: 0.9172402314678199  test error: 0.9248397489289509
iteration:  10  total error: 0.9151214466512543  test error: 0.9229129416131608
iteration:  11  total error: 0.9133863794202055  test error: 0.9214000578710291
iteration:  12  total error: 0.9119189956104645  test

[0.9062739749957849,
 0.9142781057192227,
 0.9107942589496475,
 0.9171988711591215,
 0.9628008125833368,
 0.9674349504382608,
 0.8641201399557326,
 0.8629348193753759,
 0.8698317063312794,
 0.912223924754933,
 0.962621762299224,
 0.9633037558933635,
 0.9050912380938658,
 0.9138891563578717,
 0.91075655454237,
 0.9171516909186362,
 0.9627123743722543,
 0.967533594543465,
 0.8856195628517872,
 0.8845789767356579,
 0.8691940480901359,
 0.911990531956273,
 0.9632273272782985,
 0.9633000081001682,
 0.905546841878876,
 0.9145167430376201,
 0.9109013978264983,
 0.917165137611144,
 0.9623572577736225,
 0.9674485346376516,
 0.9230544908330863,
 0.9260745806125859,
 0.8694151964647019,
 0.9123736210243549,
 0.9631422122740847,
 0.9641670674822127]

In [None]:
# Display `res`, the list the experiment cell fills with one final test RMSE
# per hyper-parameter combination.
# NOTE(review): the saved output of this cell has a different structure, so it
# was presumably produced by an earlier version of the experiment cell;
# re-run to refresh it.
res

[[[44.08085823059082],
  [0.9716820525536117],
  [1.1173721807763446, 1.0043863350704552],
  [1.1173721807763446, 1.0043863350704552]]]

In [None]:
from numpy import load
# The histories are variable-length Python lists, which NumPy stores as a
# pickled object array. Such files can only be read back with
# allow_pickle=True; the default (False) raises ValueError.
# Unpickling can execute arbitrary code, so only do this for files this
# notebook wrote itself.
load(hist_file, allow_pickle=True)

ValueError: ignored