# Model-Based Collaborative Filtering: Matrix Factorization

In [1]:
import os
from pathlib import Path

import pandas as pd
from surprise import Reader, SVD, Dataset, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate

In [2]:
# Step 1: Preparing the Data Set
# Step 2: Modeling
# Step 3: Model Tuning
# Step 4: Final Model and Prediction

# Adım 1: Veri Setinin Hazırlanması

In [5]:
# Folder that holds the MovieLens CSV files. Set the MOVIELENS_DATA_DIR environment
# variable to run the notebook on another machine; the default is the original location.
DATA_DIR = Path(os.environ.get(
    'MOVIELENS_DATA_DIR',
    '/Users/mericgenc/Desktop/Python - Data Science/Miuul Data Scientist Path/'
    '04 - Recommendation Systems/00_datasets/movie_lens_dataset'))

movie = pd.read_csv(DATA_DIR / 'movie.csv')
rating = pd.read_csv(DATA_DIR / 'rating.csv')

# Attach each movie's title/genres to its ratings. how='left' keeps every row of
# `movie`, so a movie without any rating would appear once with missing
# userId/rating values.
df = movie.merge(rating, how='left', on='movieId')
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41


In [6]:
# Movies used for the small demo. The two lists are parallel:
# movies[i] is the title of movie_ids[i] (e.g. 356 -> "Forrest Gump (1994)").
movie_ids = [130219, 356, 4422, 541]
movies = ["The Dark Knight (2011)",
          "Forrest Gump (1994)",
          "Cries and Whispers (Viskningar och rop) (1972)",
          "Blade Runner (1982)"]

In [7]:
# Restrict the merged frame to the rows of the movies listed in `movie_ids`.
sample_df = df.loc[df["movieId"].isin(movie_ids)]
sample_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
2457839,356,Forrest Gump (1994),Comedy|Drama|Romance|War,4.0,4.0,1996-08-24 09:28:42
2457840,356,Forrest Gump (1994),Comedy|Drama|Romance|War,7.0,4.0,2002-01-16 19:02:55
2457841,356,Forrest Gump (1994),Comedy|Drama|Romance|War,8.0,5.0,1996-06-05 13:44:19
2457842,356,Forrest Gump (1994),Comedy|Drama|Romance|War,9.0,4.0,2001-07-01 20:26:38
2457843,356,Forrest Gump (1994),Comedy|Drama|Romance|War,10.0,3.0,1999-11-25 02:32:02


In [8]:
# (rows, columns) of the filtered frame, i.e. how many rating rows the selected movies have.
sample_df.shape

(97343, 6)

In [9]:
# Wide view of the sample: one row per userId, one column per movie title,
# cells holding the rating (aggregated with pivot_table's default function).
user_movie_df = pd.pivot_table(sample_df,
                               values="rating",
                               index=["userId"],
                               columns=["title"])

In [10]:
# MovieLens ratings use half-star steps (values such as 4.5 and 3.5 appear in the data)
# and range from 0.5 to 5. The scale must start at 0.5; with a lower bound of 1,
# estimates for very low ratings would be clipped at 1.
reader = Reader(rating_scale=(0.5, 5))

In [11]:
# Build the Surprise dataset from the (user, item, rating) columns of the sample.
data = Dataset.load_from_df(df=sample_df[["userId", "movieId", "rating"]],
                            reader=reader)

# Adım 2: Modelleme

In [14]:
# Hold out 25% of the ratings for evaluation. The fixed random_state makes the
# split - and therefore the RMSE reported below - reproducible across runs.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [15]:
# Baseline matrix-factorization model with default hyperparameters.
# random_state fixes the random initialisation so results are reproducible.
svd_model = SVD(random_state=42)

In [16]:
# Train the model on the training split only; the test split stays unseen.
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f87047c70d0>

In [17]:
# Estimate a rating for every held-out (user, item) pair; each Prediction carries
# the true rating (r_ui) next to the estimate (est).
predictions = svd_model.test(testset)

In [18]:
# Show only the first few predictions; displaying the whole list floods the output.
predictions[:10]

[Prediction(uid=88951.0, iid=356, r_ui=4.0, est=4.190015132164808, details={'was_impossible': False}),
 Prediction(uid=105054.0, iid=356, r_ui=4.0, est=4.190015132164808, details={'was_impossible': False}),
 Prediction(uid=72085.0, iid=356, r_ui=4.5, est=4.108442741553963, details={'was_impossible': False}),
 Prediction(uid=137586.0, iid=356, r_ui=2.0, est=4.190015132164808, details={'was_impossible': False}),
 Prediction(uid=70338.0, iid=356, r_ui=4.0, est=4.190015132164808, details={'was_impossible': False}),
 Prediction(uid=94054.0, iid=356, r_ui=5.0, est=4.190015132164808, details={'was_impossible': False}),
 Prediction(uid=17440.0, iid=541, r_ui=4.5, est=4.460266098209834, details={'was_impossible': False}),
 Prediction(uid=18765.0, iid=541, r_ui=3.5, est=4.1649010377342055, details={'was_impossible': False}),
 Prediction(uid=4229.0, iid=356, r_ui=3.0, est=4.190015132164808, details={'was_impossible': False}),
 Prediction(uid=128901.0, iid=356, r_ui=3.5, est=4.190015132164808, det

In [19]:
# Root-mean-squared error on the test predictions; the call prints the score and also returns it.
accuracy.rmse(predictions)

RMSE: 0.9448


0.9448486477310569

In [20]:
# Estimated rating of user 1.0 for movie 541; verbose=True also prints the result.
# uid is written as a float because userId values are floats in this data
# (e.g. 88951.0) - presumably it has to match the ids the model was trained on.
svd_model.predict(uid=1.0, iid=541, verbose=True)

user: 1.0        item: 541        r_ui = None   est = 4.04   {'was_impossible': False}


Prediction(uid=1.0, iid=541, r_ui=None, est=4.043257673615218, details={'was_impossible': False})

# Adım 3: Model Tuning

In [34]:
# Hyperparameter candidates for the grid search: number of latent factors,
# number of training epochs, and the learning rate applied to all parameters.
param_grid = dict(
    n_factors=[10, 20, 30, 40, 100],
    n_epochs=[5, 10, 20],
    lr_all=[0.002, 0.005, 0.007],
)

In [35]:
# Exhaustive search over `param_grid` for SVD, scored with RMSE and MAE using
# 3-fold cross-validation. n_jobs=-1 runs the fits on parallel workers and
# joblib_verbose=True turns on joblib's progress messages.
gs = GridSearchCV(
    SVD, param_grid,
    cv=3,
    measures=['rmse', 'mae'],
    n_jobs=-1,
    joblib_verbose=True,
)

In [36]:
# Run the search: 5 x 3 x 3 = 45 parameter combinations, each on 3 folds (135 fits).
gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   15.6s finished


In [37]:
# Best cross-validated RMSE found by the grid search.
gs.best_score['rmse']

0.9304368380542666

In [38]:
# Parameter combination that produced the best RMSE.
gs.best_params['rmse']

{'n_factors': 20, 'n_epochs': 10, 'lr_all': 0.002}

# Adım 4: Final Modeli ve Tahmin

In [39]:
# Final model: SVD configured with the best parameters from the grid search.
# random_state fixes the random initialisation so the final estimates are reproducible.
svd_model = SVD(**gs.best_params['rmse'], random_state=42)

In [40]:
# Refit the tuned model on every available rating (no hold-out split).
# The trainset gets its own name so `data` remains a Dataset: rebinding it would
# make this cell fail when re-run and would break re-running the grid search on `data`.
full_trainset = data.build_full_trainset()
svd_model.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f8700c0c700>

In [41]:
# Same query as before tuning (user 1.0, movie 541), now answered by the final model.
svd_model.predict(uid=1.0, iid=541, verbose=True)

user: 1.0        item: 541        r_ui = None   est = 4.21   {'was_impossible': False}


Prediction(uid=1.0, iid=541, r_ui=None, est=4.211669059227884, details={'was_impossible': False})

In [42]:
# Print the final model's estimated rating of every selected movie for user 1.0.
for movie_id in movie_ids:
    svd_model.predict(uid=1.0, iid=movie_id, verbose=True)

user: 1.0        item: 130219     r_ui = None   est = 4.11   {'was_impossible': False}
user: 1.0        item: 356        r_ui = None   est = 4.05   {'was_impossible': False}
user: 1.0        item: 4422       r_ui = None   est = 4.03   {'was_impossible': False}
user: 1.0        item: 541        r_ui = None   est = 4.21   {'was_impossible': False}
