In [1]:
import math
import time
import numpy as np
from numpy import genfromtxt,savetxt,loadtxt

np.set_printoptions(suppress=True)
np.random.seed(0)

In [2]:
k = 7
alpha = 0.07
lamb_u = 0.15
lamb_i = 0.01
iterations = 20

In [3]:
#import data
train_set = genfromtxt('Train.csv', delimiter=',')
train_set = np.delete(train_set, (0), axis=0)
valid_set = genfromtxt('Validation.csv', delimiter=',')
valid_set = np.delete(valid_set, (0), axis=0)

In [4]:
#import data
users_table = genfromtxt('Users Demographics.csv', delimiter=',')
users_table = np.delete(users_table, (0), axis=0)
users = users_table[:,0]
movies_table = genfromtxt('Movies Metadata.csv', delimiter=',')
movies_table = np.delete(movies_table, (0), axis=0)
items = movies_table[:,0]

In [5]:
#initialize the biases
# b_u = np.zeros(users.size) #user bias
# b_i = np.zeros(items.size) #item bias
b_u = loadtxt('bu.csv', delimiter=',')
b_i = loadtxt('bi.csv', delimiter=',')


b = np.mean(train_set[:,2]) #global bias

In [6]:
def create_P_Q(k):
    #initialize user(P) and item(Q) feature matrix
#     P = np.array(np.random.uniform(0, 1, users.shape[0] * k).reshape(users.shape[0], k))
#     Q = np.array(np.random.uniform(0, 1, items.shape[0] * k).reshape(k, items.shape[0]))
    
    P = loadtxt('p.csv', delimiter=',')
    Q = loadtxt('q.csv', delimiter=',')
    return P,Q

In [7]:
def mae(predictions, targets):
    return sum(abs(targets - predictions))/ targets.shape[0]

In [8]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

In [9]:
def r_squre(predictions, targets):
    SSres = sum((targets - predictions)** 2)
    SStotal = sum((targets - np.mean(targets))** 2)
    return 1 - (SSres/SStotal)

In [10]:
def train(train_set,P,Q,alpha,lamb_u,lamb_i,iterations, valid_set):
    flag = 0
    rmse_old = 0 
    RMSE = 0
    for i in range(iterations) :
        P,Q, prediction,b_u,b_i = SGD(train_set, P, Q, alpha, lamb_u,lamb_i)
        targets = train_set[:,2]
        RMSE_T = rmse(prediction,targets)
        print(f"iteration {i} : error in train is: {RMSE_T}")
        
        #test for overfitting
        p = predict(valid_set[:,[0,1]], P,Q,b_u,b_i)
        targets = valid_set[:,2]
        rmse_old = RMSE
        RMSE =rmse(p,targets)
        print(f"iteration {i} : error in test is: {RMSE}")
        if RMSE > rmse_old and i>1 :
            flag = flag + 1
        if flag==2:
            break
        
        #update alpha       
        alpha = 0.9*alpha 
        
    return P,Q,b_u,b_i

In [11]:
def SGD(train_set, P, Q, alpha, lamb_u,lamb_i):
    #stochastic gradient descent
    predictions = []
    for i,j,r in train_set:
        error = 0
        i = int(i)
        j = int(j)
        
        prediction = b + b_u[i-1] + b_i[j-1] + P[i-1,:].dot(Q[:,j-1])
        predictions.append(prediction)
        error = (r - prediction)

        #update biases
        b_u[i-1] = b_u[i-1] + alpha*(error-lamb_u*b_u[i-1])
        b_i[j-1] = b_i[j-1] + alpha*(error-lamb_i*b_i[j-1])
        
        #update matrrix
        p_new = P[i-1,:] + alpha*(error*Q[:,j-1]-lamb_u*P[i-1,:])
        q_new = Q[:,j-1] + alpha*(error*P[i-1,:]-lamb_i*Q[:,j-1])
        P[i-1,:] = p_new[:]
        Q[:,j-1] = q_new[:]       
  
    return P,Q,predictions,b_u,b_i


In [12]:
def predict(data, P,Q,b_u,b_i): 
# the function get a set of pairs (user,item), a model (P,Q,b_u,b_i) and predict the rating
    predictions = []
    for i,j in data: 
        i=int(i)
        j=int(j)
        prediction = b + b_u[i-1] + b_i[j-1] + P[i-1,:].dot(Q[:,j-1])
        predictions.append(prediction)
        
    return predictions

In [13]:
def grid_search(train_set,valid_set,ks,alphas,lambs_u,lambs_i,iterations):
    score = np.inf
    counter = 1
    best = [0,0,0,0]
    for k in ks:
        for alpha in alphas:
            for lamb_u in lambs_u:
                for lumb_i in lambs_i:
                    start_time = time.time()
                    p,q = create_P_Q(k)
                    print(f"{counter}: params: [k:{k}, alpha:{alpha}, lamb_u:{lamb_u}, lamb_i:{lamb_i}]")
                    targets = valid_set[:,2]
                    p,q,b_u,b_i = train(train_set,p,q,alpha,lamb_u,lamb_i,iterations,valid_set)
                    predictions = predict(valid_set[:,[0,1]],p,q,b_u,b_i)
                    RMSE = rmse(predictions,targets)
                    if RMSE < score:
                        score = RMSE
                        best = [k,alpha,lamb_u,lamb_i]
                        savetxt('bi.csv', b_i, delimiter=',') #save tha best model
                        savetxt('bu.csv', b_u, delimiter=',') #save tha best model
                        savetxt('p.csv', p, delimiter=',') #save tha best model
                        savetxt('q.csv', q, delimiter=',') #save tha best model

                    print(f"Best params: {best}, with rmse = {score}\n")
                    counter += 1
    return best,RMSE

In [16]:
best_params,rmse = grid_search(train_set,valid_set,ks,alphas,lambs_u,lambs_i,80)

In [13]:
#main

# P,Q=create_P_Q(k)
# P,Q,b_u,b_i = train(train_set,P,Q,alpha,lamb_u,lamb_i,iterations, valid_set)

prediction = predict(valid_set[:,[0,1]], P,Q,b_u,b_i)
targets = valid_set[:,2]

#calc RMSE,MAE,R^2
MAE = mae(prediction,targets)
RMSE =rmse(prediction,targets)
R_SQURED = r_squre(prediction, targets)
print(f"RMSE: {RMSE} ,MAE: {MAE}, R_SQURED: {R_SQURED}")

0.6965028040077988
0.8955580061590693
0.35500416783787747


In [14]:
ks=[7]
alphas = [0.07]
lambs_u = [0.15]
lambs_i = [0.01]

In [None]:

# Best params: [9, 0.05, 0.05], with rmse = 0.90765267392406
# Best params: [9, 0.01, 0.05], with rmse = 0.9022141593766955
# Best params: [9, 0.05, 0.1, 0.01], with rmse = 0.901681924572154
# Best params: [7, 0.07, 0.1, 0.01], with rmse = 0.8968920688922477
# Best params: [7, 0.09, 0.1, 0.01], with rmse = 0.8993324660380766
# Best params: [8, 0.07, 0.1, 0.01], with rmse = 0.8989113015264709
# Best params: [7, 0.07, 0.15, 0.01], with rmse = 0.8955580061590693

In [29]:
savetxt('bi.csv', b_i, delimiter=',')
savetxt('bu.csv', b_u, delimiter=',')
savetxt('p.csv', P, delimiter=',')
savetxt('q.csv', Q, delimiter=',')

NameError: name 'P' is not defined