In [191]:
import pandas as pd
from sklearn.model_selection import KFold
import random
random.seed(2792729)

In [2]:
# loading data
movies = pd.read_csv(
    'movies.dat',
    sep = "::",
    names = ['MovieID', 'Title', 'Genres'],
    encoding='latin-1',
    engine='python',
)

ratings = pd.read_csv(
    'ratings.dat',
    sep = "::",
    names = ['UserID','MovieID','Rating','Timestamp'],
    encoding='latin-1',
    engine='python',
)

users = pd.read_csv(
    'users.dat',
    sep = "::",
    names = ['UserID', 'Gender', 'Age','Occupation','Zip-code'],
    encoding='latin-1',
    engine='python',
)
display(movies.head())
display(users.head())
ratings.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## 1. Recommender System

## 1.2 UV Matrix Decomposition

Notes: To increase our chances of finding the global minimum, we need to pick many dif- ferent starting points, that is, different choices of the initial matrices U and V . However, there is never a guarantee that our best local minimum will be the global minimum.

In [3]:
import numpy as np

In [30]:
# creating matrix M
Utility_DF = ratings.pivot(index = 'UserID', columns ='MovieID', values = 'Rating')
M = Utility_DF.to_numpy()
M.shape
B = M - np.nanmean(M,axis=1, keepdims=True)
B = B - np.nanmean(M, axis=0, keepdims=True)
print(M)
print(B)

[[ 5. nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [ 3. nan nan ... nan nan nan]]
[[-3.33552566         nan         nan ...         nan         nan
          nan]
 [        nan         nan         nan ...         nan         nan
          nan]
 [        nan         nan         nan ...         nan         nan
          nan]
 ...
 [        nan         nan         nan ...         nan         nan
          nan]
 [        nan         nan         nan ...         nan         nan
          nan]
 [-4.72455902         nan         nan ...         nan         nan
          nan]]


In [185]:
class Matrix_Decomposition:
    def __init__(self, matrix, dimensions = 5, iterations = 1):
        self.matrix = matrix
        self.n = self.matrix.shape[0]
        self.m = self.matrix.shape[1]
        self.d = dimensions
        self.normalized = self.matrix - np.nanmean(self.matrix,axis=1, keepdims=True)
        self.normalized = self.matrix - np.nanmean(self.matrix,axis=0, keepdims=True)
        self.U = np.random.rand(self.n,self.d) + np.sqrt(np.nanmean(self.matrix)/self.d)
        self.V = np.random.rand(self.d,self.m) + np.sqrt(np.nanmean(self.matrix)/self.d)
        self.iter = iterations


        
    def rmse(self):
        e = 0.
        c = 0.
        UV = np.dot(self.U,self.V)
        for i in range(0,self.n):
            for j in range(0,self.normalized[i].shape[0]):
                e += (self.normalized[i,j]-UV[i,j])**2
                c += 1
                return np.sqrt(e/c)

            
    def rmse_new(self):
        UV = np.dot(self.U,self.V)
        error = np.nanmean((self.normalized - UV)**2)
        return error



    def UV(self):
        err = [self.rmse_new()]
        for i in range(self.iter):
            # decompose User matrix
            for r in range(0,self.n):
                for s in range(0,self.d):
                    m = np.array(self.normalized[r,:])
                    v = np.array(self.V[s,:])
                    u = np.array(self.U[r,:])
                    index_no_s = np.array(range(self.d)) != s
                    
                    # formula page 334
                    #self.U[r,s]=float(np.nansum(self.V[s,:]*(m-np.matmul(self.normalized[r,:],self.V[:])-(self.U[r,s]*self.V[s,:]))))/np.nansum(np.square(v))
                    #self.U[r,s]=float(np.nansum(v * (m-np.nansum(self.U[r,s]*v))))/np.nansum(np.square(v))
                    self.U[r,s]=np.nansum(v*(m-np.dot(u[index_no_s],self.V[index_no_s,:])))/np.sum(v**2)
            
            # decompose item matrix
            for s in range(0,self.m):
                for r in range(0,self.d):
                    m = np.array(self.normalized[:,s])
                    u = np.array(self.U[:,r])
                    v = np.array(self.V[:,s])
                    index_no_r = np.array(range(self.d)) != r
                    
                    # formula page 334
                    #self.V[r,s]=float(np.nansum(self.U[:, r] * (m - np.matmul(self.U[:], self.V[:,s]) - (self.V[r, s] * self.U[:, r]))))/np.nansum(np.square(u))
                    self.V[r,s]=np.nansum(u * (m-np.dot(self.U[:,index_no_r],v[index_no_r])))/np.sum(u**2)
                    
            if err[i] - self.rmse_new() < 0.000001 and i > self.iter/2:
                break
            
            else:
                print('Iteration: ' + str(i) + ', Error: ' + str(self.rmse_new()))
                err.append(self.rmse_new())
            
        return self.U, self.V, err

In [186]:
model_matrix = Matrix_Decomposition(M, iterations = 90)

In [187]:
uv = model_matrix.UV()

Iteration: 0, Error: 0.9506667187610622
Iteration: 1, Error: 0.8592411402641396
Iteration: 2, Error: 0.8464151293210905
Iteration: 3, Error: 0.8404268546208787
Iteration: 4, Error: 0.8356802252477904
Iteration: 5, Error: 0.8335272241568384
Iteration: 6, Error: 0.8326786371736342
Iteration: 7, Error: 0.8321448134220549
Iteration: 8, Error: 0.8316615206753322
Iteration: 9, Error: 0.831165505581883
Iteration: 10, Error: 0.830663670938501
Iteration: 11, Error: 0.8301925354693521
Iteration: 12, Error: 0.8297842308189639
Iteration: 13, Error: 0.8294494552938619
Iteration: 14, Error: 0.8291824983993548
Iteration: 15, Error: 0.8289715201040979
Iteration: 16, Error: 0.8288043958035306
Iteration: 17, Error: 0.8286708364150626
Iteration: 18, Error: 0.828562846606183
Iteration: 19, Error: 0.8284744920336005
Iteration: 20, Error: 0.82840142756367
Iteration: 21, Error: 0.8283404359622958
Iteration: 22, Error: 0.8282890782049115
Iteration: 23, Error: 0.8282454606283505
Iteration: 24, Error: 0.8282080

In [189]:
uv

(array([[-3.50651567e-01,  1.34590165e-02,  4.34103148e-03,
         -5.76404932e-03,  1.05883298e-02],
        [-2.27240080e-01,  6.60161954e-02, -1.88792639e-02,
          8.56452918e-03, -1.81115692e-02],
        [-6.91900758e-02,  4.02810602e-02,  3.46987897e-02,
          2.43781072e-03, -3.72233512e-04],
        ...,
        [-8.51004538e-03, -1.46125512e-02,  8.97978987e-03,
          2.34967437e-03, -2.26667036e-02],
        [-6.15397458e-02,  2.51625407e-02, -1.23079140e-02,
          5.11539799e-03, -1.70951105e-02],
        [ 6.85370413e-01, -3.09952557e-01, -7.10873997e-02,
          4.41273790e-02, -1.23621806e-01]]),
 array([[-2.97002419e-01, -1.16199495e-01, -7.91236184e-02, ...,
         -5.44935704e-03, -4.50653412e-03, -3.09059804e-02],
        [ 2.30587986e-01,  2.60917328e-01,  1.61373326e-01, ...,
          9.20952261e-03, -2.42528435e-04,  5.55413325e-02],
        [-8.25783218e-01,  5.64943512e-01,  4.79508663e-01, ...,
          1.53973575e-02,  2.45351026e-02,  