# Model Based Matrix Factorization

In [13]:
# libraries

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from surprise import Reader, SVD, Dataset, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate

pd.set_option("display.max_columns",None)
pd.set_option("display.width",500)
sns.set(rc={"figure.figsize":(12,12)})

# Preperation of Data

In [14]:
movie = pd.read_csv("datas/movie.csv")
rating = pd.read_csv("datas/rating.csv")
data = movie.merge(rating, how="left", on="movieId")
data.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41


In [15]:
movie_ids = [130219, 356, 4422, 541]
movie_names = ["The Dark Knight (2011)",
               "Cries and Whispers (Viskningar och rop) (1972)",
               "Forrest Gump (1994)",
               "Blade Runner (1982)"]


sample_data = data[data.movieId.isin(movie_ids)]
print(sample_data.shape)
sample_data.head()

(97343, 6)


Unnamed: 0,movieId,title,genres,userId,rating,timestamp
2457839,356,Forrest Gump (1994),Comedy|Drama|Romance|War,4.0,4.0,1996-08-24 09:28:42
2457840,356,Forrest Gump (1994),Comedy|Drama|Romance|War,7.0,4.0,2002-01-16 19:02:55
2457841,356,Forrest Gump (1994),Comedy|Drama|Romance|War,8.0,5.0,1996-06-05 13:44:19
2457842,356,Forrest Gump (1994),Comedy|Drama|Romance|War,9.0,4.0,2001-07-01 20:26:38
2457843,356,Forrest Gump (1994),Comedy|Drama|Romance|War,10.0,3.0,1999-11-25 02:32:02


In [16]:
user_movie_df = sample_data.pivot_table(index=["userId"], columns=["title"],values="rating")
print(user_movie_df.shape)
user_movie_df.head()

(76918, 4)


title,Blade Runner (1982),Cries and Whispers (Viskningar och rop) (1972),Forrest Gump (1994),The Dark Knight (2011)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,4.0,,,
2.0,5.0,,,
3.0,5.0,,,
4.0,,,4.0,
7.0,,,4.0,


In [17]:
reader = Reader(rating_scale=(1,5))

In [18]:
data = Dataset.load_from_df(sample_data[["userId",
                                        "movieId",
                                         "rating"]], reader)

# Modeling

In [19]:
trainset, testset = train_test_split(data,test_size=.25)

svd_model = SVD()
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x18d41c45190>

In [20]:
predictions = svd_model.test(testset)

In [21]:
predictions[:10]

[Prediction(uid=42498.0, iid=541, r_ui=4.5, est=3.800839272203745, details={'was_impossible': False}),
 Prediction(uid=54214.0, iid=356, r_ui=3.0, est=4.08410235773904, details={'was_impossible': False}),
 Prediction(uid=113602.0, iid=356, r_ui=5.0, est=4.08410235773904, details={'was_impossible': False}),
 Prediction(uid=2995.0, iid=356, r_ui=5.0, est=4.08410235773904, details={'was_impossible': False}),
 Prediction(uid=46173.0, iid=541, r_ui=5.0, est=4.097224264948224, details={'was_impossible': False}),
 Prediction(uid=60371.0, iid=356, r_ui=4.0, est=4.08410235773904, details={'was_impossible': False}),
 Prediction(uid=113168.0, iid=356, r_ui=5.0, est=4.08410235773904, details={'was_impossible': False}),
 Prediction(uid=31232.0, iid=356, r_ui=3.0, est=4.08410235773904, details={'was_impossible': False}),
 Prediction(uid=22225.0, iid=356, r_ui=3.0, est=4.08410235773904, details={'was_impossible': False}),
 Prediction(uid=123533.0, iid=356, r_ui=2.0, est=4.08410235773904, details={'wa

The real ratings are *r_ui* and estimated ratings are *est*.

In [22]:
accuracy = accuracy.rmse(predictions)
accuracy

RMSE: 0.9287


0.9286616192913335

In [24]:
svd_model.predict(uid = 1.0,iid=541, verbose=True)

user: 1.0        item: 541        r_ui = None   est = 4.10   {'was_impossible': False}


Prediction(uid=1.0, iid=541, r_ui=None, est=4.097224264948224, details={'was_impossible': False})

In [25]:
sample_data[sample_data["userId"] == 1.0]

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
3612352,541,Blade Runner (1982),Action|Sci-Fi|Thriller,1.0,4.0,2005-04-02 23:30:03


The first user did rate **Blade Runner** as 4.0 and estimated rate is 4.10.

# Model Tuning

In [27]:
param_grid = {"n_epochs":[5,10,15],
             "lr_all": [0.002,0.005,0.007]}

grid_search = GridSearchCV(SVD, param_grid, measures=["rmse","mae"], cv = 3, n_jobs=-1, joblib_verbose=True)
grid_search.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    6.6s finished


In [28]:
grid_search.best_score["rmse"]

0.9317844017737613

In [29]:
grid_search.best_params["rmse"]

{'n_epochs': 5, 'lr_all': 0.002}

# Final Model and Prediction

In [30]:
svd_model = SVD(**grid_search.best_params["rmse"]) # the new model with best parameters

data = data.build_full_trainset()
svd_model.fit(data)
svd_model.predict(uid = 1.0,iid=541, verbose=True)

user: 1.0        item: 541        r_ui = None   est = 4.19   {'was_impossible': False}


Prediction(uid=1.0, iid=541, r_ui=None, est=4.193686713022256, details={'was_impossible': False})