In [84]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.linalg import svd
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Project 3 - Matrix Factorization

In [2]:
# Singular-value decomposition
from numpy import array

# define a matrix
A = array([[1,2], [3,4], [5,6]])
print(A)

#SVD
U, s, VT = svd(A)
print(U)
print(s)
print(VT)

[[1 2]
 [3 4]
 [5 6]]
[[-0.2298477   0.88346102  0.40824829]
 [-0.52474482  0.24078249 -0.81649658]
 [-0.81964194 -0.40189603  0.40824829]]
[9.52551809 0.51430058]
[[-0.61962948 -0.78489445]
 [-0.78489445  0.61962948]]


In [3]:
# Reconstruct SVD
from numpy import diag
from numpy import dot
from numpy import zeros

print(A)
# define a matrix
Sigma = zeros((A.shape[0], A.shape[1]))
# populate Sigma with n x n diagonal matrix
Sigma[:A.shape[1], :A.shape[1]] = diag(s)
# reconstruct matrix
B = U.dot(Sigma.dot(VT))
print(B)

[[1 2]
 [3 4]
 [5 6]]
[[1. 2.]
 [3. 4.]
 [5. 6.]]


In [4]:
A = array([[1,2,3], [4,5,6], [7,8,9]])
print(A)
# Singular value decomposition
U, s, VT = svd(A)
# create n x n Sigma matrix
Sigma = diag(s)
# reconstruct matrix
B = U.dot(Sigma.dot(VT))
print(B)

[[1 2 3]
 [4 5 6]
 [7 8 9]]
[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]


Since the MovieLens dataset deals with implicit data, a very sparse matrix, Singular-Value Decomposition is not the most effective method. SVD just assigns a predicted rating by imputation to missing data, but there is more information behind that missing data when the data is implicit. The user could love the item but not know about it. In this case Alternating Least Squares (ALS) is used, which is an interative process which tries to get closer and closer to a factorized representation of the original data.

ALS essentially is fitting a line, taking the sum of squares, and iterating over and over trying to minimize this value.

In [68]:
# ratings = pd.read_csv('https://raw.githubusercontent.com/mjdacs/data612/master/project_2/ml-latest-small/ratings.csv')
# movies = pd.read_csv('https://raw.githubusercontent.com/mjdacs/data612/master/project_2/ml-latest-small/movies.csv', index_col='movieId')
# tags = pd.read_csv('https://raw.githubusercontent.com/mjdacs/data612/master/project_2/ml-latest-small/tags.csv')

ratings = pd.read_csv('C:\\Users\\1239783\\Python\\data612-master\\project_2\\ml-latest-small\\ratings.csv')
movies = pd.read_csv('C:\\Users\\1239783\\Python\\data612-master\\project_2\\ml-latest-small\\movies.csv', index_col='movieId')
tags = pd.read_csv('C:\\Users\\1239783\\Python\\data612-master\\project_2\\ml-latest-small\\tags.csv')

In [99]:
ratings.rating.isna().sum()

0

In [94]:
ratings_df = ratings.pivot(index='userId', columns='movieId', values='rating')
ratings_df.head()


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [95]:
ratings.rating.mean()

3.501556983616962

In [100]:
ratings_train, ratings_test = train_test_split(ratings, test_size=0.2)

In [101]:
ratings_test.head()

Unnamed: 0,userId,movieId,rating,timestamp
20059,132,2291,4.0,1329983923
81163,514,1387,5.0,1534052825
70890,453,608,4.0,972622467
35234,237,6373,3.0,1410632204
38082,260,5782,3.5,1109410682


In [102]:
ratings_train.head()

Unnamed: 0,userId,movieId,rating,timestamp
44351,294,3363,4.0,966595332
57786,380,57669,5.0,1493420622
93970,599,4992,1.5,1498504941
78747,489,2455,3.5,1332774778
16411,105,5026,4.0,1446773487


In [110]:
training_mean = ratings_train.rating.mean()

In [114]:
ratings_test['train_mean'] = training_mean
ratings_test = ratings_test.drop(columns=['timestamp'])
ratings_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,userId,movieId,rating,train_mean
20059,132,2291,4.0,3.50124
81163,514,1387,5.0,3.50124
70890,453,608,4.0,3.50124
35234,237,6373,3.0,3.50124
38082,260,5782,3.5,3.50124


In [115]:
from sklearn.metrics import mean_squared_error
mean_squared_error(ratings_test.rating, ratings_test.train_mean)

1.0800718798483653

In [89]:
# Convert to numpy array
R_matrix = ratings_df.values
ratings_mean = np.mean(R_matrix, axis=1)
R_norm = ratings_mean.reshape(-1, 1)
R_norm.shape

(610, 1)

In [91]:
U, sigma, Vt = svds(R_norm)

ValueError: k must be between 1 and min(A.shape), k=6

In [58]:
from sklearn.impute import SimpleImputer
imputed_matrix = csr_matrix(sparse_ratings.values)

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(imputed_matrix)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)