In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Project 3 - Matrix Factorization

In [2]:
# Singular-value decomposition
from numpy import array
from scipy.linalg import svd

# define a matrix
A = array([[1,2], [3,4], [5,6]])
print(A)

#SVD
U, s, VT = svd(A)
print(U)
print(s)
print(VT)

[[1 2]
 [3 4]
 [5 6]]
[[-0.2298477   0.88346102  0.40824829]
 [-0.52474482  0.24078249 -0.81649658]
 [-0.81964194 -0.40189603  0.40824829]]
[9.52551809 0.51430058]
[[-0.61962948 -0.78489445]
 [-0.78489445  0.61962948]]


In [3]:
# Reconstruct SVD
from numpy import diag
from numpy import dot
from numpy import zeros

print(A)
# define a matrix
Sigma = zeros((A.shape[0], A.shape[1]))
# populate Sigma with n x n diagonal matrix
Sigma[:A.shape[1], :A.shape[1]] = diag(s)
# reconstruct matrix
B = U.dot(Sigma.dot(VT))
print(B)

[[1 2]
 [3 4]
 [5 6]]
[[1. 2.]
 [3. 4.]
 [5. 6.]]


In [4]:
A = array([[1,2,3], [4,5,6], [7,8,9]])
print(A)
# Singular value decomposition
U, s, VT = svd(A)
# create n x n Sigma matrix
Sigma = diag(s)
# reconstruct matrix
B = U.dot(Sigma.dot(VT))
print(B)

[[1 2 3]
 [4 5 6]
 [7 8 9]]
[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]


Since the MovieLens dataset deals with implicit data, a very sparse matrix, Singular-Value Decomposition is not the most effective method. SVD just assigns a predicted rating by imputation to missing data, but there is more information behind that missing data when the data is implicit. The user could love the item but not know about it. In this case Alternating Least Squares (ALS) is used, which is an interative process which tries to get closer and closer to a factorized representation of the original data.

ALS essentially is fitting a line, taking the sum of squares, and iterating over and over trying to minimize this value.

# SVD

In [5]:
# ratings = pd.read_csv('https://raw.githubusercontent.com/mjdacs/data612/master/project_2/ml-latest-small/ratings.csv')
# movies = pd.read_csv('https://raw.githubusercontent.com/mjdacs/data612/master/project_2/ml-latest-small/movies.csv', index_col='movieId')
# tags = pd.read_csv('https://raw.githubusercontent.com/mjdacs/data612/master/project_2/ml-latest-small/tags.csv')

ratings = pd.read_csv('C:\\Users\\1239783\\Python\\data612-master\\project_2\\ml-latest-small\\ratings.csv')
movies = pd.read_csv('C:\\Users\\1239783\\Python\\data612-master\\project_2\\ml-latest-small\\movies.csv', index_col='movieId')
tags = pd.read_csv('C:\\Users\\1239783\\Python\\data612-master\\project_2\\ml-latest-small\\tags.csv')

In [6]:
ratings.rating.mean()

3.501556983616962

In [7]:
ratings_df = ratings.pivot(index='userId', columns='movieId', values='rating')
ratings_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


### Training and Test set creation 

In [8]:
train, test = train_test_split(ratings_df, test_size=0.2)

In [9]:
df_train = pd.melt(train.reset_index(), id_vars='userId')

In [10]:
df_train.head(10)

Unnamed: 0,userId,movieId,value
0,21,1,3.5
1,583,1,
2,398,1,
3,459,1,
4,343,1,
5,364,1,5.0
6,331,1,
7,344,1,
8,479,1,
9,234,1,5.0


In [11]:
train_mean = df_train.value.mean()
train_mean

3.513581558068345

In [12]:
df_test = pd.melt(test.reset_index(), id_vars='userId')

In [13]:
df_test.head(10)

Unnamed: 0,userId,movieId,value
0,359,1,4.0
1,433,1,
2,384,1,
3,133,1,
4,26,1,
5,42,1,
6,14,1,
7,237,1,
8,419,1,
9,28,1,


In [14]:
df_test = df_test.assign(training_mean=train_mean)
df_test.head()

Unnamed: 0,userId,movieId,value,training_mean
0,359,1,4.0,3.513582
1,433,1,,3.513582
2,384,1,,3.513582
3,133,1,,3.513582
4,26,1,,3.513582


In [15]:
df_test.shape

(1186328, 4)

In [16]:
df_final_test = df_test[np.isfinite(df_test.value)]
df_final_test.head()

Unnamed: 0,userId,movieId,value,training_mean
0,359,1,4.0,3.513582
10,559,1,5.0,3.513582
15,82,1,2.5,3.513582
16,456,1,5.0,3.513582
17,448,1,5.0,3.513582


In [17]:
df_final_test.shape

(20801, 4)

In [20]:
mse = mean_squared_error(df_final_test.value, df_final_test.training_mean)
print(f'The mean squared error is: {mse}')

The mean squared error is: 1.1828324899684037


### Singular Value Decomposition

Here we fill the sparse training matrix NaN's with the mean

In [21]:
train.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21,3.5,3.5,,,,,,,,5.0,...,,,,,,,,,,
583,,,,,,,,,,,...,,,,,,,,,,
398,,,,,,,,,,,...,,,,,,,,,,
459,,,,,,,,,,,...,,,,,,,,,,
343,,,,,,,,,,,...,,,,,,,,,,


In [22]:
# Impute and convert from df to array using .values
imputed_train = ratings_df.fillna(train_mean).values
imputed_train

array([[4.        , 3.51358156, 4.        , ..., 3.51358156, 3.51358156,
        3.51358156],
       [3.51358156, 3.51358156, 3.51358156, ..., 3.51358156, 3.51358156,
        3.51358156],
       [3.51358156, 3.51358156, 3.51358156, ..., 3.51358156, 3.51358156,
        3.51358156],
       ...,
       [2.5       , 2.        , 2.        , ..., 3.51358156, 3.51358156,
        3.51358156],
       [3.        , 3.51358156, 3.51358156, ..., 3.51358156, 3.51358156,
        3.51358156],
       [5.        , 3.51358156, 3.51358156, ..., 3.51358156, 3.51358156,
        3.51358156]])

In [23]:
U, s, VT = svds(imputed_train, k=50)

In [24]:
print(U.shape, s.shape, VT.shape)

(610, 50) (50,) (50, 9724)


In [25]:
Sigma = np.diag(s) 

In [26]:
B = U.dot(Sigma.dot(VT))
B.shape

(610, 9724)

In [27]:
df_B = pd.DataFrame(B)
df_B.index = ratings_df.index
df_B.columns = ratings_df.columns
df_B.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.90338,3.620638,3.804019,3.51648,3.610843,3.867939,3.411269,3.523542,3.585657,3.696212,...,3.521308,3.524278,3.518337,3.518337,3.521308,3.518337,3.521308,3.521308,3.521308,3.517901
2,3.549254,3.503687,3.473184,3.504699,3.533562,3.481297,3.483244,3.516779,3.510707,3.542553,...,3.513419,3.511179,3.515658,3.515658,3.513419,3.515658,3.513419,3.513419,3.513419,3.514218
3,3.312307,3.384739,3.419722,3.502327,3.444301,3.396172,3.39478,3.52013,3.525253,3.582507,...,3.517159,3.514476,3.519843,3.519843,3.517159,3.519843,3.517159,3.517159,3.517159,3.517728
4,3.330164,3.294295,3.368224,3.491586,3.52135,3.365281,3.392202,3.453728,3.587733,3.359371,...,3.509855,3.507645,3.512065,3.512065,3.509855,3.512065,3.509855,3.509855,3.509855,3.508798
5,3.74132,3.493662,3.485844,3.495246,3.501691,3.618234,3.527925,3.509422,3.495753,3.487842,...,3.514686,3.514253,3.515118,3.515118,3.514686,3.515118,3.514686,3.514686,3.514686,3.515185


In [28]:
tidy_B = pd.melt(df_B.reset_index(), id_vars='userId')
tidy_B.head()

Unnamed: 0,userId,movieId,value
0,1,1,3.90338
1,2,1,3.549254
2,3,1,3.312307
3,4,1,3.330164
4,5,1,3.74132


In [29]:
SVD_eval = pd.merge(df_final_test, tidy_B, how='left',
                   left_on=['userId', 'movieId'],
                   right_on=['userId', 'movieId'])
SVD_eval = SVD_eval.rename(columns={'value_x': 'test_values', 'value_y': 'SVD_values'})
SVD_eval.isna().sum()

userId           0
movieId          0
test_values      0
training_mean    0
SVD_values       0
dtype: int64

In [30]:
SVD_eval.head()

Unnamed: 0,userId,movieId,test_values,training_mean,SVD_values
0,359,1,4.0,3.513582,3.732106
1,559,1,5.0,3.513582,3.878513
2,82,1,2.5,3.513582,3.32281
3,456,1,5.0,3.513582,3.733287
4,448,1,5.0,3.513582,5.069874


In [31]:
svd_mse = mean_squared_error(SVD_eval.test_values, SVD_eval.SVD_values)
print(f'The RMSE for SVD is: {svd_mse}')

The RMSE for SVD is: 0.43996992953727465
