In [1]:
import math
import time
import numpy as np
from numpy import genfromtxt,savetxt,loadtxt

np.set_printoptions(suppress=True)
np.random.seed(0)

In [2]:
k = 2
alpha = 0.05
lamb = 0.01
iterations = 500

In [3]:
#import data
train_set = genfromtxt('Train.csv', delimiter=',')
train_set = np.delete(train_set, (0), axis=0)
valid_set = genfromtxt('Validation.csv', delimiter=',')
valid_set = np.delete(valid_set, (0), axis=0)

In [4]:
#import data
users_table = genfromtxt('Users Demographics.csv', delimiter=',')
users_table = np.delete(users_table, (0), axis=0)
users = users_table[:,0]
movies_table = genfromtxt('Movies Metadata.csv', delimiter=',')
movies_table = np.delete(movies_table, (0), axis=0)
items = movies_table[:,0]

In [5]:
#initialize the biases
b_u = np.zeros(users.size) #user bias
b_i = np.zeros(items.size) #item bias
b = np.mean(train_set[:,2]) #global bias

In [6]:
def create_P_Q(k):
    #initialize user(P) and item(Q) feature matrix
    P = np.array(np.random.uniform(0, 1, users.shape[0] * k).reshape(users.shape[0], k))
    Q = np.array(np.random.uniform(0, 1, items.shape[0] * k).reshape(k, items.shape[0]))
#     P = loadtxt('p.csv', delimiter=',')
#     Q = loadtxt('q.csv', delimiter=',')
    return P,Q

In [7]:
def mae(predictions, targets):
    return sum(abs(targets - predictions))/ targets.shape[0]

In [8]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

In [9]:
def r_squre(predictions, targets):
    SSres = sum((targets - predictions)** 2)
    SStotal = sum((targets - np.mean(targets))** 2)
    return 1 - SSres/SStotal

In [10]:
def train(train_set,P,Q,alpha,lamb,iterations):
    for i in range(iterations) :
        P,Q, prediction,b_u,b_i = SGD(train_set, P, Q, alpha, lamb)
        targets = train_set[:,2]
        MAE = mae(prediction,targets)
        print(f"iteration {i} : error is: {MAE}")
        alpha = 0.95*alpha #update alpha
        
    return P,Q,b_u,b_i
    

In [11]:
def SGD(train_set, P, Q, alpha, lamb):
    #stochastic gradient descent
    predictions = []
    for i,j,r in train_set:
        error = 0
        i = int(i)
        j = int(j)
        
        prediction = b + b_u[i-1] + b_i[j-1] + P[i-1,:].dot(Q[:,j-1])
        predictions.append(prediction)
        error = (r - prediction)
#         print(error)
#         print(f"{i},{j}    r: {r}    pred {prediction}    err: {error}")
        
#         if math.isinf(b_u[i-1]):
#         print(f"{i},{j}    {b_u[i-1]}")

        #update biases
        b_u[i-1] = b_u[i-1] + alpha*(error-lamb*b_u[i-1])
        b_i[j-1] = b_i[j-1] + alpha*(error-lamb*b_i[j-1])
        
         #update matrrix
#         if i == 1333:
#             print(f"{i},{j} prediction:{prediction} Q: {Q[:,j-1]} P: {lamb*P[i-1,:]} err: {error} Bu: {b_u[i-1]} Bi: {b_i[j-1]}")
        p_new = P[i-1,:] + alpha*(error*Q[:,j-1]-lamb*P[i-1,:])
        q_new = Q[:,j-1] + alpha*(error*P[i-1,:]-lamb*Q[:,j-1])
        P[i-1,:] = p_new[:]
        Q[:,j-1] = q_new[:]
#         print(p_new)
#         print(error)
    
    return P,Q,predictions,b_u,b_i


In [12]:
def predict(data, P,Q,b_u,b_i):
    predictions = []
    for i,j in data: 
        i=int(i)
        j=int(j)
        prediction = b + b_u[i-1] + b_i[j-1] + P[i-1,:].dot(Q[:,j-1])
        predictions.append(prediction)
        
    return predictions

In [None]:
#main
P,Q=create_P_Q(k)
P,Q,b_u,b_i = train(train_set,P,Q,alpha,lamb,iterations)
prediction = predict(valid_set[:,[0,1]], P,Q,b_u,b_i)
targets = valid_set[:,2]
MAE = mae(prediction,targets)
RMSE =rmse(prediction,targets)
R_SQURED = r_squre(prediction, targets)
print(MAE)
print(RMSE)
print(R_SQURED)

In [19]:
def grid_search(train_set,valid_set,ks,alphas,lambs,iterations):
    score = np.inf
    counter = 1
    best = [0,0,0]
    for k in ks:
        p,q = create_P_Q(k)
        for alpha in alphas:
            for lamb in lambs:
                start_time = time.time()
                print(f"{counter}: params: [k:{k}, alpha:{alpha}, lamb:{lamb}]")
                targets = valid_set[:,2]
                p,q,b_u,b_i = train(train_set,p,q,alpha,lamb,iterations)
                predictions = predict(valid_set[:,[0,1]],p,q,b_u,b_i)
                RMSE = rmse(predictions,targets)
                if RMSE < score:
                    score = RMSE
                    best = [k,alpha,lamb]
#                 print(f"\ncombination {counter}: {k,alpha,lamb} took {time.time() - start_time} seconds and got RMSE = {RMSE}")
                print(f"Best params: {best}, with rmse = {score}\n")
                counter += 1
    return best,RMSE

In [20]:
ks = list(range(1,20,4))

alphas = list(range(1,26,5))
alphas = [alpha/100 for alpha in alphas]

lambs = list(range(1,26,5))
lambs = [lamb/100 for lamb in lambs]

In [21]:
best_params,rmse = grid_search(train_set,valid_set,ks,alphas,lambs,15)

1: params: [k:1, alpha:0.01, lamb:0.01]
iteration 0 : error is: 0.7254474094610038
iteration 1 : error is: 0.7169296734722818
iteration 2 : error is: 0.7141661891541999
iteration 3 : error is: 0.7125652570551706
iteration 4 : error is: 0.7111937303384993
iteration 5 : error is: 0.7096935506918437
iteration 6 : error is: 0.707894552238018
iteration 7 : error is: 0.7057954920183773
iteration 8 : error is: 0.7035154340047292
iteration 9 : error is: 0.7012233721345653
iteration 10 : error is: 0.6990351880134709
iteration 11 : error is: 0.6970372057752856
iteration 12 : error is: 0.6952538226492966
iteration 13 : error is: 0.6936877485178269
iteration 14 : error is: 0.692320464464612
Best params: [1, 0.01, 0.01], with rmse = 0.9480010880938748

2: params: [k:1, alpha:0.01, lamb:0.06]
iteration 0 : error is: 0.6952115886129898
iteration 1 : error is: 0.6955076254867342
iteration 2 : error is: 0.6945789777594985
iteration 3 : error is: 0.693621934780254
iteration 4 : error is: 0.6927559182703

iteration 0 : error is: 0.7292029283001071
iteration 1 : error is: 0.7283089266493774
iteration 2 : error is: 0.7260895472829403
iteration 3 : error is: 0.7239142263698871
iteration 4 : error is: 0.7218406044358034
iteration 5 : error is: 0.7198705579773318
iteration 6 : error is: 0.7180052141107827
iteration 7 : error is: 0.7162420837839549
iteration 8 : error is: 0.7145752681823393
iteration 9 : error is: 0.7130001027001664
iteration 10 : error is: 0.7115103113676754
iteration 11 : error is: 0.7101004320680531
iteration 12 : error is: 0.708766289577919
iteration 13 : error is: 0.7075018623637821
iteration 14 : error is: 0.7063031109547361
Best params: [1, 0.01, 0.11], with rmse = 0.9373998966122565

13: params: [k:1, alpha:0.11, lamb:0.11]
iteration 0 : error is: 0.7320940561791737
iteration 1 : error is: 0.7318304521965963
iteration 2 : error is: 0.7299333578884543
iteration 3 : error is: 0.7280216393157889
iteration 4 : error is: 0.7261843647162213
iteration 5 : error is: 0.7244371

iteration 0 : error is: 0.778029548477687
iteration 1 : error is: 0.7745397771776754
iteration 2 : error is: 0.7698324852092284
iteration 3 : error is: 0.7653567972422497
iteration 4 : error is: 0.7611642546794974
iteration 5 : error is: 0.7572443239842168
iteration 6 : error is: 0.7535789294712268
iteration 7 : error is: 0.7501450483931557
iteration 8 : error is: 0.7469321237665337
iteration 9 : error is: 0.743919828319599
iteration 10 : error is: 0.741095120601463
iteration 11 : error is: 0.7384468666012682
iteration 12 : error is: 0.7359604402053772
iteration 13 : error is: 0.733625574101655
iteration 14 : error is: 0.7314290916974623
Best params: [1, 0.01, 0.11], with rmse = 0.9373998966122565

24: params: [k:1, alpha:0.21, lamb:0.16]
iteration 0 : error is: 0.7785063740668
iteration 1 : error is: 0.7757661148356865
iteration 2 : error is: 0.7716937865501922
iteration 3 : error is: 0.7677692919989295
iteration 4 : error is: 0.7640758589558322
iteration 5 : error is: 0.7606130455504

iteration 0 : error is: 0.7075025817237521
iteration 1 : error is: 0.7126122188378912
iteration 2 : error is: 0.7135548752470864
iteration 3 : error is: 0.7136116136109257
iteration 4 : error is: 0.7133335050182137
iteration 5 : error is: 0.7129000448152255
iteration 6 : error is: 0.7123862650936708
iteration 7 : error is: 0.7118329534767981
iteration 8 : error is: 0.7112616672049981
iteration 9 : error is: 0.7106847805471345
iteration 10 : error is: 0.7101120432723208
iteration 11 : error is: 0.709547997222266
iteration 12 : error is: 0.7089966828812058
iteration 13 : error is: 0.7084586166141102
iteration 14 : error is: 0.7079355886085213
Best params: [5, 0.01, 0.06], with rmse = 0.9190883578003384

35: params: [k:5, alpha:0.06, lamb:0.21]
iteration 0 : error is: 0.7246941345699824
iteration 1 : error is: 0.7276116425480945
iteration 2 : error is: 0.7280046074633916
iteration 3 : error is: 0.727854159777633
iteration 4 : error is: 0.7275000379702191
iteration 5 : error is: 0.72705387

iteration 0 : error is: 0.7609103910610008
iteration 1 : error is: 0.7607043131793613
iteration 2 : error is: 0.7587053313127933
iteration 3 : error is: 0.7565422794918143
iteration 4 : error is: 0.7544105620170201
iteration 5 : error is: 0.7523590892751119
iteration 6 : error is: 0.7504020252261555
iteration 7 : error is: 0.7485424188860806
iteration 8 : error is: 0.746779595729588
iteration 9 : error is: 0.745110281150868
iteration 10 : error is: 0.743530432456172
iteration 11 : error is: 0.7420332713269165
iteration 12 : error is: 0.7406154142255084
iteration 13 : error is: 0.7392749374969367
iteration 14 : error is: 0.7380061981619095
Best params: [5, 0.01, 0.06], with rmse = 0.9190883578003384

46: params: [k:5, alpha:0.21, lamb:0.01]
iteration 0 : error is: 0.8046281432564178
iteration 1 : error is: 0.8030357237507492
iteration 2 : error is: 0.7932714212089014
iteration 3 : error is: 0.7836521498848522
iteration 4 : error is: 0.7747554111366289
iteration 5 : error is: 0.766456167

iteration 0 : error is: 0.6914730078559487
iteration 1 : error is: 0.6703929015487656
iteration 2 : error is: 0.6613548267662678
iteration 3 : error is: 0.6558884932041542
iteration 4 : error is: 0.6517121648038972
iteration 5 : error is: 0.6481919768604204
iteration 6 : error is: 0.6450868475579428
iteration 7 : error is: 0.6422806786832964
iteration 8 : error is: 0.6397041089514657
iteration 9 : error is: 0.6373096503656811
iteration 10 : error is: 0.6350703203585714
iteration 11 : error is: 0.6329638431578908
iteration 12 : error is: 0.6309771015901151
iteration 13 : error is: 0.6290980856290619
iteration 14 : error is: 0.6273178527796043
Best params: [9, 0.01, 0.06], with rmse = 0.9145193965123494

57: params: [k:9, alpha:0.06, lamb:0.06]
iteration 0 : error is: 0.6575962221604569
iteration 1 : error is: 0.6617574164614628
iteration 2 : error is: 0.6610208066985795
iteration 3 : error is: 0.6597179093150991
iteration 4 : error is: 0.6582277796435334
iteration 5 : error is: 0.656679

iteration 0 : error is: 0.7409194698548743
iteration 1 : error is: 0.738062556544768
iteration 2 : error is: 0.7323563184660972
iteration 3 : error is: 0.7269979319608955
iteration 4 : error is: 0.721924811422143
iteration 5 : error is: 0.7170965912827837
iteration 6 : error is: 0.7125067519190613
iteration 7 : error is: 0.7081347334614977
iteration 8 : error is: 0.7039725219072073
iteration 9 : error is: 0.7000202598921974
iteration 10 : error is: 0.6962683645426602
iteration 11 : error is: 0.692704680500819
iteration 12 : error is: 0.6893230375676578
iteration 13 : error is: 0.6861081728682458
iteration 14 : error is: 0.6830468356512858
Best params: [9, 0.06, 0.06], with rmse = 0.9128193801441623

68: params: [k:9, alpha:0.16, lamb:0.11]
iteration 0 : error is: 0.7383941388853382
iteration 1 : error is: 0.7401651247427348
iteration 2 : error is: 0.7375129872852345
iteration 3 : error is: 0.7344958856790066
iteration 4 : error is: 0.73144994570754
iteration 5 : error is: 0.72847592462

  if __name__ == '__main__':


iteration 0 : error is: nan
iteration 1 : error is: nan
iteration 2 : error is: nan
iteration 3 : error is: nan
iteration 4 : error is: nan
iteration 5 : error is: nan
iteration 6 : error is: nan
iteration 7 : error is: nan
iteration 8 : error is: nan
iteration 9 : error is: nan
iteration 10 : error is: nan
iteration 11 : error is: nan
iteration 12 : error is: nan
iteration 13 : error is: nan
iteration 14 : error is: nan
Best params: [9, 0.06, 0.06], with rmse = 0.9128193801441623

73: params: [k:9, alpha:0.21, lamb:0.11]
iteration 0 : error is: nan
iteration 1 : error is: nan
iteration 2 : error is: nan
iteration 3 : error is: nan
iteration 4 : error is: nan
iteration 5 : error is: nan
iteration 6 : error is: nan
iteration 7 : error is: nan
iteration 8 : error is: nan
iteration 9 : error is: nan
iteration 10 : error is: nan
iteration 11 : error is: nan
iteration 12 : error is: nan
iteration 13 : error is: nan
iteration 14 : error is: nan
Best params: [9, 0.06, 0.06], with rmse = 0.912

iteration 10 : error is: nan
iteration 11 : error is: nan
iteration 12 : error is: nan
iteration 13 : error is: nan
iteration 14 : error is: nan
Best params: [9, 0.06, 0.06], with rmse = 0.9128193801441623

88: params: [k:13, alpha:0.11, lamb:0.11]
iteration 0 : error is: nan
iteration 1 : error is: nan
iteration 2 : error is: nan
iteration 3 : error is: nan
iteration 4 : error is: nan
iteration 5 : error is: nan
iteration 6 : error is: nan
iteration 7 : error is: nan
iteration 8 : error is: nan
iteration 9 : error is: nan
iteration 10 : error is: nan
iteration 11 : error is: nan
iteration 12 : error is: nan
iteration 13 : error is: nan
iteration 14 : error is: nan
Best params: [9, 0.06, 0.06], with rmse = 0.9128193801441623

89: params: [k:13, alpha:0.11, lamb:0.16]
iteration 0 : error is: nan
iteration 1 : error is: nan
iteration 2 : error is: nan
iteration 3 : error is: nan
iteration 4 : error is: nan
iteration 5 : error is: nan
iteration 6 : error is: nan
iteration 7 : error is: na

iteration 1 : error is: nan
iteration 2 : error is: nan
iteration 3 : error is: nan
iteration 4 : error is: nan
iteration 5 : error is: nan
iteration 6 : error is: nan
iteration 7 : error is: nan
iteration 8 : error is: nan
iteration 9 : error is: nan
iteration 10 : error is: nan
iteration 11 : error is: nan
iteration 12 : error is: nan
iteration 13 : error is: nan
iteration 14 : error is: nan
Best params: [9, 0.06, 0.06], with rmse = 0.9128193801441623

104: params: [k:17, alpha:0.01, lamb:0.16]
iteration 0 : error is: nan
iteration 1 : error is: nan
iteration 2 : error is: nan
iteration 3 : error is: nan
iteration 4 : error is: nan
iteration 5 : error is: nan
iteration 6 : error is: nan
iteration 7 : error is: nan
iteration 8 : error is: nan
iteration 9 : error is: nan
iteration 10 : error is: nan
iteration 11 : error is: nan
iteration 12 : error is: nan
iteration 13 : error is: nan
iteration 14 : error is: nan
Best params: [9, 0.06, 0.06], with rmse = 0.9128193801441623

105: params

iteration 10 : error is: nan
iteration 11 : error is: nan
iteration 12 : error is: nan
iteration 13 : error is: nan
iteration 14 : error is: nan
Best params: [9, 0.06, 0.06], with rmse = 0.9128193801441623

119: params: [k:17, alpha:0.16, lamb:0.16]
iteration 0 : error is: nan
iteration 1 : error is: nan
iteration 2 : error is: nan
iteration 3 : error is: nan
iteration 4 : error is: nan
iteration 5 : error is: nan
iteration 6 : error is: nan
iteration 7 : error is: nan
iteration 8 : error is: nan
iteration 9 : error is: nan
iteration 10 : error is: nan
iteration 11 : error is: nan
iteration 12 : error is: nan
iteration 13 : error is: nan
iteration 14 : error is: nan
Best params: [9, 0.06, 0.06], with rmse = 0.9128193801441623

120: params: [k:17, alpha:0.16, lamb:0.21]
iteration 0 : error is: nan
iteration 1 : error is: nan
iteration 2 : error is: nan
iteration 3 : error is: nan
iteration 4 : error is: nan
iteration 5 : error is: nan
iteration 6 : error is: nan
iteration 7 : error is: 

In [None]:
# best [9, 0.06, 0.06]

In [None]:
savetxt('bi.csv', b_i, delimiter=',')
savetxt('bu.csv', b_u, delimiter=',')
savetxt('p.csv', P, delimiter=',')
savetxt('q.csv', Q, delimiter=',')