In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from surprise.model_selection import KFold

In [2]:
## 1. Reading Dataset
# Directory holding ratings.csv and movie_info.csv. The default is the original
# location; set the RATINGS_DATA_DIR environment variable to run the notebook
# against a copy of the data stored somewhere else.
DATA_DIR = Path(os.environ.get(
    'RATINGS_DATA_DIR',
    '/Users/paramanandbhat/Downloads/MatrixfactorizationBasedCollaborativeFilteringusingSurprise-201024-234535 (1)',
))

#Reading ratings file:
ratings = pd.read_csv(DATA_DIR / 'ratings.csv')

#Reading Movie Info File
movie_info = pd.read_csv(DATA_DIR / 'movie_info.csv')

In [3]:
## 2.  Merging Movie information to ratings dataframe

# Left join keeps every rating row; only the id and title columns of movie_info are brought in
ratings = pd.merge(
    ratings,
    movie_info[['movie id', 'movie title']],
    how='left',
    left_on='movie_id',
    right_on='movie id',
)

In [4]:
# Show the first rows of the merged ratings frame
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,movie id,movie title
0,196,242,3,881250949,242,Kolya (1996)
1,186,302,3,891717742,302,L.A. Confidential (1997)
2,22,377,1,878887116,377,Heavyweights (1994)
3,244,51,2,880606923,51,Legends of the Fall (1994)
4,166,346,1,886397596,346,Jackie Brown (1997)


In [5]:
# Combined "<movie_id>: <movie title>" label, used as the item identifier from here on
ratings['movie'] = (
    ratings['movie_id'].map(str)
    + ': '
    + ratings['movie title'].map(str)
)

In [6]:
# List the column names currently on the ratings frame
ratings.columns

Index(['user_id', 'movie_id', 'rating', 'unix_timestamp', 'movie id',
       'movie title', 'movie'],
      dtype='object')

In [7]:
# The raw id, title and timestamp columns are no longer needed once the combined 'movie' label exists
ratings = ratings.drop(columns=['movie id', 'movie title', 'movie_id', 'unix_timestamp'])

In [8]:
# Arrange the remaining columns in (user, item, rating) order
ratings = ratings.loc[:, ['user_id', 'movie', 'rating']]

In [9]:
## 3. Creating Train & Test Data & Setting Evaluation Metric

In [10]:
#Work on a copy so the ratings dataframe itself is left untouched
X = ratings.copy(deep=True)
#Hold out 25% of the rows as the test set; the fixed seed makes the split repeatable
X_train, X_test = train_test_split(
    X,
    test_size=0.25,
    random_state=42,
)

In [11]:
#Evaluation metric: root mean squared error (RMSE)
def rmse(y_true, y_pred):
    """Return the square root of the mean squared error between y_true and y_pred."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [12]:
## 4. Importing Surprise & Loading Dataset

In [13]:
#Importing functions to be used in this notebook from Surprise Package
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV

In [14]:
#Reader declaring the rating scale (1 to 5) of the training ratings
reader = Reader(rating_scale=(1, 5))

#Wrap the training rows in a Surprise Dataset, columns passed in (user, item, rating) order
data = Dataset.load_from_df(
    X_train[['user_id', 'movie', 'rating']],
    reader,
)

In [15]:
 ## 5. Fitting SVD Model with 100 latent factors on train set and checking performance on test set 

In [16]:
# Train a new SVD with 100 latent features (number was chosen arbitrarily).
# random_state is fixed so the learned factors and the test RMSE below are
# reproducible from one run of the notebook to the next.
model = SVD(n_factors=100, random_state=42)


In [17]:
#build_full_trainset() turns the whole training Dataset into one trainset, so the SVD
#model is fit on every training row instead of on a single cross-validation fold
full_trainset = data.build_full_trainset()
model.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x15dd44190>

In [18]:
#(user, movie) id pairs of the test set
id_pairs = zip(X_test['user_id'], X_test['movie'])

#Estimated rating for each pair; `.est` is the fourth field of Surprise's Prediction tuple
y_pred = [
    model.predict(uid=user, iid=movie).est
    for user, movie in id_pairs
]

#Ratings actually given in the test set
y_true = X_test['rating']

#RMSE of the predictions on the test set
rmse(y_true, y_pred)

0.9391769808176333

In [19]:
## 6. Examining the user and item matrices

In [20]:
#Distinct movies and distinct users present in the training split
X_train['movie'].nunique(), X_train['user_id'].nunique()

(1642, 943)

In [21]:
# qi (movie factors) and pu (user factors): one row per movie / user seen in training,
# one column per latent factor, shown next to the distinct movie and user counts
model.qi.shape, model.pu.shape, X_train['movie'].nunique(), X_train['user_id'].nunique()

((1642, 100), (943, 100), 1642, 943)

In [22]:
#Percentage reduction in size wrt user item matrix
# The dimensions are read from the fitted model rather than typed in by hand, so
# the figure stays correct if the data or the number of latent factors changes.
n_items, n_factors = model.qi.shape
n_users = model.pu.shape[0]
dense_cells = n_items * n_users                 # full user x movie matrix
factor_cells = (n_items + n_users) * n_factors  # movie factors + user factors
(dense_cells - factor_cells) / dense_cells * 100

83.30541214642672

In [27]:
#Extracting id for Toy story within qi matrix
# to_inner_iid is the public Trainset lookup from a raw item id to its inner row
# index; it replaces direct access to the private _raw2inner_id_items dict.
movie_row_idx = model.trainset.to_inner_iid('1: Toy Story (1995)')
np.array(model.qi[movie_row_idx])

array([-1.48373825e-01,  7.59731191e-02, -3.63499073e-01,  1.86555928e-02,
        7.98321191e-02,  5.20103611e-02, -2.09626165e-01, -2.33814055e-01,
        1.11392323e-01,  1.67545664e-01,  5.51212068e-03,  3.30618514e-02,
       -1.44643475e-01,  1.03356788e-02,  1.28104661e-01, -1.52466854e-01,
       -6.92946998e-02,  5.26345963e-02, -8.81273866e-03, -4.28778615e-02,
       -2.64935495e-03,  3.46368158e-02,  1.19759381e-02, -7.60803946e-02,
       -1.11553784e-01,  1.47819589e-01,  6.42637914e-02, -3.78179309e-04,
       -1.00504392e-01,  8.25475928e-02,  1.47536665e-01,  6.96938131e-02,
       -4.78103976e-02,  1.81470307e-01,  2.32642355e-01,  3.35017106e-02,
        2.87273888e-01,  1.86843430e-01, -1.04564964e-01,  1.46762743e-01,
        1.16835686e-01,  6.22268587e-02,  2.16527234e-01,  3.64468933e-02,
        1.54183595e-02,  1.61228342e-01,  1.20047647e-01, -2.30187770e-01,
        6.35602517e-02,  3.39724297e-01, -1.88966206e-02,  8.50078835e-03,
        3.26026936e-02,  

In [24]:
#Latent factors learnt from Funk SVD for Toy Story
# The row index is looked up here instead of reusing movie_row_idx: that name is
# reassigned for Wizard of Oz further down, so re-running this cell out of order
# would silently store the wrong movie's vector.
ts_vector = np.array(model.qi[model.trainset.to_inner_iid('1: Toy Story (1995)')])

In [25]:
#Extracting id for Wizard of Oz within qi matrix
# Uses the public Trainset.to_inner_iid lookup rather than the private
# _raw2inner_id_items dict.
movie_row_idx = model.trainset.to_inner_iid('132: Wizard of Oz, The (1939)')
woz_vector = np.array(model.qi[movie_row_idx])

In [26]:
#Cosine similarity (1 - cosine distance) between the Toy Story and Wizard of Oz latent factors
from scipy import spatial
cosine_dist = spatial.distance.cosine(ts_vector, woz_vector)
1 - cosine_dist

0.05625509865856204

In [28]:
## 7. Grid Search for better performance with SVD

In [29]:
#Defining the parameter grid for SVD and fixing the random state
param_grid = {'n_factors':list(range(1,50,5)), 'n_epochs': [5, 10, 20], 'random_state': [42]}

#Seeded 5-fold splitter. Passing cv=5 shuffles the folds with no seed, so the
#best score and parameters could change between runs even though SVD is seeded.
cv_folds = KFold(n_splits=5, random_state=42, shuffle=True)

#Defining the grid search with the parameter grid and SVD algorithm optimizing for RMSE
gs = GridSearchCV(SVD, 
                  param_grid, 
                  measures=['rmse'], 
                  cv=cv_folds, 
                  n_jobs = -1)

#Fitting the grid search on the training data
gs.fit(data)
 
#Printing the best score
print(gs.best_score['rmse'])

#Printing the best set of parameters
print(gs.best_params['rmse'])

0.9441258312059828
{'n_factors': 11, 'n_epochs': 20, 'random_state': 42}


In [30]:
#Fitting the model on train data with the best parameters
# The parameters are read from the grid search result instead of being copied by
# hand from its printed output, so this cell cannot drift out of sync with it.
best_params = gs.best_params['rmse']
model = SVD(**best_params)

#Build full trainset will essentially fits the SVD on the complete train set instead of a part of it
#like we do in cross validation for grid search
model.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x168e808d0>

In [31]:
#(user, movie) id pairs of the test set
id_pairs = zip(X_test['user_id'], X_test['movie'])

#Estimated rating for each pair; `.est` is the fourth field of Surprise's Prediction tuple
y_pred = [
    model.predict(uid=user, iid=movie).est
    for user, movie in id_pairs
]

#Ratings actually given in the test set
y_true = X_test['rating']

#RMSE of the tuned model on the test set
rmse(y_true, y_pred)

0.9390125163978547