In [400]:
import pandas as pd
from sklearn.model_selection import KFold
import random
random.seed(3)

In [401]:
# loading data
movies = pd.read_csv(
    'movies.dat',
    sep = "::",
    names = ['MovieID', 'Title', 'Genres'],
    encoding='latin-1',
    engine='python',
)

ratings = pd.read_csv(
    'ratings.dat',
    sep = "::",
    names = ['UserID','MovieID','Rating','Timestamp'],
    encoding='latin-1',
    engine='python',
)

users = pd.read_csv(
    'users.dat',
    sep = "::",
    names = ['UserID', 'Gender', 'Age','Occupation','Zip-code'],
    encoding='latin-1',
    engine='python',
)
display(movies.head())
display(users.head())
ratings.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## 1. Recommender System

## 1.2 UV Matrix Decomposition

Notes: To increase our chances of finding the global minimum, we need to pick many dif- ferent starting points, that is, different choices of the initial matrices U and V . However, there is never a guarantee that our best local minimum will be the global minimum.

In [402]:
import numpy as np

In [444]:
class Matrix_Decomposition:
    def __init__(self, matrix, dimensions = 5, iterations = 1):
        self.matrix = matrix
        self.n = self.matrix.shape[0]
        self.m = self.matrix.shape[1]
        self.d = dimensions
        self.normalized = self.matrix - np.nanmean(self.matrix,axis=1, keepdims=True) - np.nanmean(self.matrix,axis=0, keepdims=True)
        self.U = np.random.rand(self.n,self.d) + np.sqrt(np.nanmean(self.matrix)/self.d)
        self.V = np.random.rand(self.d,self.m) + np.sqrt(np.nanmean(self.matrix)/self.d)
        self.iter = iterations
        self.transformed_UV = np.dot(self.U, self.V)+np.nanmean(self.matrix,axis=0, keepdims=True)+np.nanmean(self.matrix,axis=1, keepdims=True)
            
    def rmse_new(self):
        UV = np.dot(self.U,self.V)
        error = np.nanmean((self.normalized - UV)**2)
        return error


    def UV(self):
        err = [self.rmse_new()]
        for i in range(self.iter):
            # decompose User matrix
            for r in range(0,self.n):
                for s in range(0,self.d):
                    m = np.array(self.normalized[r,:])
                    v = np.array(self.V[s,:])
                    u = np.array(self.U[r,:])
                    index_no_s = np.array(range(self.d)) != s
                    self.U[r,s]=np.nansum(v*(m-np.dot(u[index_no_s],self.V[index_no_s,:])))/np.sum(v**2)
            
            # decompose item matrix
            for s in range(0,self.m):
                for r in range(0,self.d):
                    m = np.array(self.normalized[:,s])
                    u = np.array(self.U[:,r])
                    v = np.array(self.V[:,s])
                    index_no_r = np.array(range(self.d)) != r
                    self.V[r,s]=np.nansum(u * (m-np.dot(self.U[:,index_no_r],v[index_no_r])))/np.sum(u**2)
                    
            if err[i] - self.rmse_new() < 0.000001 and i > self.iter/2:
                break
            
            else:
                print('Iteration: ' + str(i) + ', Error: ' + str(self.rmse_new()))
                err.append(self.rmse_new())
            
        return self.U, self.V, err

In [None]:
## Prediction of ratings
x = ratings.sample(frac = 1)
X = np.array_split(x, 5)

rmses = []
maes = []

for i in range(5):
    # Find train and test utility matrices
    X_test = X[i]
    X_train = pd.concat([X[j] for j in range(5) if j!=i])
    Y_test = X_test.copy()
    Y_train = X_train.copy()
    Y_train[['Rating']] = np.nan
    X_test = pd.concat([X_test,Y_train])
    Y_test[['Rating']] = np.nan
    X_train = pd.concat([X_train, Y_test])
    
    # Perform prediction
    Utility_DF = X_train.pivot(index = 'UserID', columns ='MovieID', values = 'Rating')
    Utility_test = X_test.pivot(index = 'UserID', columns ='MovieID', values = 'Rating')
    
    M = Utility_DF.to_numpy()
    M_test = Utility_test.to_numpy()
    model_matrix = Matrix_Decomposition(M, iterations = 90)
    uv = model_matrix.UV()
    r=np.dot(model_matrix.U, model_matrix.V)+np.nanmean(M,axis=0, keepdims=True)+np.nanmean(M,axis=1, keepdims=True)
    rmse = np.sqrt(np.nanmean((r - M_test)**2))
    rmses += [rmse]
    mae = np.nanmean(abs(r - M_test))
    maes += [mae]

  self.normalized = self.matrix - np.nanmean(self.matrix,axis=1, keepdims=True) - np.nanmean(self.matrix,axis=0, keepdims=True)
  self.transformed_UV = np.dot(self.U, self.V)+np.nanmean(self.matrix,axis=0, keepdims=True)+np.nanmean(self.matrix,axis=1, keepdims=True)


Iteration: 0, Error: 6.20520808236576
Iteration: 1, Error: 5.803083398694377
Iteration: 2, Error: 5.765303814815167
Iteration: 3, Error: 5.736555638750696
Iteration: 4, Error: 5.7180664301475295
Iteration: 5, Error: 5.70795862618895
Iteration: 6, Error: 5.7023825975588265
Iteration: 7, Error: 5.699064220608889
Iteration: 8, Error: 5.696923227423008
Iteration: 9, Error: 5.695458003805929
Iteration: 10, Error: 5.69441877601245
Iteration: 11, Error: 5.693668047255334
Iteration: 12, Error: 5.693122783208237
Iteration: 13, Error: 5.692728768349329
Iteration: 14, Error: 5.692448186085225
Iteration: 15, Error: 5.692253213834507
Iteration: 16, Error: 5.692122595372776
Iteration: 17, Error: 5.692039746353639
Iteration: 18, Error: 5.691991648901584
Iteration: 19, Error: 5.691968141871648
Iteration: 20, Error: 5.691961404318827
Iteration: 21, Error: 5.691965535896465
Iteration: 22, Error: 5.691976194486449
Iteration: 23, Error: 5.691990278422637
Iteration: 24, Error: 5.692005651129
Iteration: 25,

  r=np.dot(model_matrix.U, model_matrix.V)+np.nanmean(M,axis=0, keepdims=True)+np.nanmean(M,axis=1, keepdims=True)


Iteration: 0, Error: 6.1944682886466635
Iteration: 1, Error: 5.802577599090412
Iteration: 2, Error: 5.769153039735476
Iteration: 3, Error: 5.741052609183996
Iteration: 4, Error: 5.723029729136073
Iteration: 5, Error: 5.713053114538401


In [None]:
rmses

In [None]:
maes