In [1]:
import pandas as pd
from sklearn.model_selection import KFold

In [2]:
# loading data
movies = pd.read_csv(
    'movies.dat',
    sep = "::",
    names = ['MovieID', 'Title', 'Genres'],
    encoding='latin-1',
    engine='python',
)

ratings = pd.read_csv(
    'ratings.dat',
    sep = "::",
    names = ['UserID','MovieID','Rating','Timestamp'],
    encoding='latin-1',
    engine='python',
)

users = pd.read_csv(
    'users.dat',
    sep = "::",
    names = ['UserID', 'Gender', 'Age','Occupation','Zip-code'],
    encoding='latin-1',
    engine='python',
)
display(movies.head())
display(users.head())
ratings.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## 1. Recommender System

## 1.2 UV Matrix Decomposition

Notes: To increase our chances of finding the global minimum, we need to pick many dif- ferent starting points, that is, different choices of the initial matrices U and V . However, there is never a guarantee that our best local minimum will be the global minimum.

In [3]:
import numpy as np

In [30]:
# creating matrix M
Utility_DF = ratings.pivot(index = 'UserID', columns ='MovieID', values = 'Rating')
M = Utility_DF.to_numpy()
M.shape
B = M - np.nanmean(M,axis=1, keepdims=True)
B = B - np.nanmean(M, axis=0, keepdims=True)
print(M)
print(B)

[[ 5. nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [ 3. nan nan ... nan nan nan]]
[[-3.33552566         nan         nan ...         nan         nan
          nan]
 [        nan         nan         nan ...         nan         nan
          nan]
 [        nan         nan         nan ...         nan         nan
          nan]
 ...
 [        nan         nan         nan ...         nan         nan
          nan]
 [        nan         nan         nan ...         nan         nan
          nan]
 [-4.72455902         nan         nan ...         nan         nan
          nan]]


In [178]:
class Matrix_Decomposition:
    def __init__(self, matrix, dimensions = 5, iterations = 1):
        self.matrix = matrix
        self.n = self.matrix.shape[0]
        self.m = self.matrix.shape[1]
        self.d = dimensions
        self.normalized = self.matrix - np.nanmean(self.matrix,axis=1, keepdims=True)
        self.normalized = self.matrix - np.nanmean(self.matrix,axis=0, keepdims=True)
        self.U = np.random.rand(self.n,self.d) + np.sqrt(np.nanmean(self.matrix)/self.d)
        self.V = np.random.rand(self.d,self.m) + np.sqrt(np.nanmean(self.matrix)/self.d)
        self.iter = iterations


        
    def rmse(self):
        e = 0.
        c = 0.
        UV = np.dot(self.U,self.V)
        for i in range(0,self.n):
            for j in range(0,self.normalized[i].shape[0]):
                e += (self.normalized[i,j]-UV[i,j])**2
                c += 1
                return np.sqrt(e/c)

            
    def rmse_new(self):
        UV = np.dot(self.U,self.V)
        error = np.nanmean((self.normalized - UV)**2)
        return error



    def UV(self):
        err = [self.rmse_new()]
        for i in range(self.iter):
            # decompose User matrix
            for r in range(0,self.n):
                for s in range(0,self.d):
                    m = np.array(self.normalized[r,:])
                    v = np.array(self.V[s,:])
                    u = np.array(self.U[r,:])
                    index_no_s = np.array(range(self.d)) != s
                    
                    # formula page 334
                    #self.U[r,s]=float(np.nansum(self.V[s,:]*(m-np.matmul(self.normalized[r,:],self.V[:])-(self.U[r,s]*self.V[s,:]))))/np.nansum(np.square(v))
                    #self.U[r,s]=float(np.nansum(v * (m-np.nansum(self.U[r,s]*v))))/np.nansum(np.square(v))
                    self.U[r,s]=np.nansum(v*(m-np.dot(u[index_no_s],self.V[index_no_s,:])))/np.sum(v**2)
            
            # decompose item matrix
            for s in range(0,self.m):
                for r in range(0,self.d):
                    m = np.array(self.normalized[:,s])
                    u = np.array(self.U[:,r])
                    v = np.array(self.V[:,s])
                    index_no_r = np.array(range(self.d)) != r
                    
                    # formula page 334
                    #self.V[r,s]=float(np.nansum(self.U[:, r] * (m - np.matmul(self.U[:], self.V[:,s]) - (self.V[r, s] * self.U[:, r]))))/np.nansum(np.square(u))
                    self.V[r,s]=np.nansum(u * (m-np.dot(self.U[:,index_no_r],v[index_no_r])))/np.sum(u**2)
                    
            if err[i] - self.rmse_new() < 0.00001 and i > self.iter/2:
                break
            
            else:
                print('Iteration: ' + str(i) + ', Error: ' + str(self.rmse_new()))
                err.append(self.rmse_new())
            
        return self.U, self.V, err

In [179]:
model_matrix = Matrix_Decomposition(M, iterations = 90)

In [None]:
uv = model_matrix.UV()

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8


In [148]:
uv

(array([[nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan],
        ...,
        [nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan]]),
 array([[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]]),
 [84.71554233359859,
  2.8326146480461333e+52,
  1.1243418249457061e+86,
  1.109382800383537e+113,
  9.466890287363476e+136,
  4.925079726458975e+158,
  7.627193967084792e+176,
  6.531758382230628e+197,
  6.865794118551444e+218,
  8.619547013989027e+239,
  1.2711732142467206e+261,
  2.1467922867911156e+282,
  inf,
  inf,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
 