Model Based Collaborative Filtering
- Predicts the ratings that a user would give a movie based on past ratings
- 

In [152]:
import numpy as np 
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import SVD, KNNBasic, NMF, SlopeOne, CoClustering, BaselineOnly
from surprise import accuracy


Read in Datasets
- Ratings
- MoviesMetadata

In [153]:
ratings = pd.read_csv('moviesRatings/ratings_small.csv')

In [154]:
moviesMetadata = pd.read_csv('moviesRatings/movies_metadata.csv')

  moviesMetadata = pd.read_csv('moviesRatings/movies_metadata.csv')


Feature Cleaning

In [155]:
# Casting 'id' column as int type
moviesMetadata['id'] = pd.to_numeric(moviesMetadata['id'], errors='coerce').astype('Int64')

In [156]:
# Checking if there are any rows with empyty value for id for moviesMetadata
moviesMetadata[moviesMetadata['id'].isnull()]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
19730,- Written by Ørnås,0.065736,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[{'name': 'Carousel Productions', 'id': 11176}...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,1,,,,,,,,,
29503,Rune Balot goes to a casino connected to the ...,1.931659,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[{'name': 'Aniplex', 'id': 2883}, {'name': 'Go...","[{'iso_3166_1': 'US', 'name': 'United States o...",,0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,...,12,,,,,,,,,
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[{'name': 'Odyssey Media', 'id': 17161}, {'nam...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",,0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,22,,,,,,,,,


In [157]:
# Dropping rows with empty
moviesMetadata = moviesMetadata.drop([19730, 29503, 35587])

In [158]:
# Number of users
print('The ratings dataset has', ratings['userId'].nunique(), 'unique users')

# Number of movies
print('The ratings dataset has', ratings['movieId'].nunique(), 'unique movies')

# Number of ratings
print('The ratings dataset has', ratings['rating'].nunique(), 'unique ratings')

# List of unique ratings
print('The unique ratings are', sorted(ratings['rating'].unique()))

The ratings dataset has 671 unique users
The ratings dataset has 9066 unique movies
The ratings dataset has 10 unique ratings
The unique ratings are [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]


In [159]:
ratings = ratings.drop(columns='timestamp')

ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [160]:
ratings.describe()

Unnamed: 0,userId,movieId,rating
count,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608
std,195.163838,26369.198969,1.058064
min,1.0,1.0,0.5
25%,182.0,1028.0,3.0
50%,367.0,2406.5,4.0
75%,520.0,5418.0,4.0
max,671.0,163949.0,5.0


Merge Ratings and MoviesMetadata together

In [161]:
movie_rating_df = pd.merge(moviesMetadata, ratings, how = 'inner', left_on='id', right_on='movieId')

Read the data using the Surprise Library

In [164]:
reader = Reader(rating_scale=(1,5))
surprise_data = Dataset.load_from_df(movie_rating_df[['userId',
                                                      'id',
                                                      'rating']] , reader)

Modelling
- Train-Test set split
- Using SVD model

In [165]:
trainset , testset = train_test_split(surprise_data, test_size = 0.25)

In [166]:
# Testing on SVD
svd_model = SVD()
svd_model.fit(trainset)
predictions = svd_model.test(testset)

In [167]:
accuracy.rmse(predictions)


RMSE: 0.8957


0.8956997482999489

In [169]:
svd_model.predict(uid=1, iid=2455, verbose=True)

user: 1          item: 2455       r_ui = None   est = 3.15   {'was_impossible': False}


Prediction(uid=1, iid=2455, r_ui=None, est=3.145321479418584, details={'was_impossible': False})

In [170]:
svd_model.predict(uid=1, iid=1371, verbose=True)


user: 1          item: 1371       r_ui = None   est = 2.60   {'was_impossible': False}


Prediction(uid=1, iid=1371, r_ui=None, est=2.600049056733666, details={'was_impossible': False})

Tuning

In [171]:
param_grid = {'n_epochs': [5, 10, 20],
              'lr_all': [0.002, 0.005, 0.007]}

In [172]:
gs = GridSearchCV(SVD,
                  param_grid,
                  measures=['rmse', 'mae'],
                  cv=3,
                  n_jobs=-1,
                  joblib_verbose=True)

In [173]:
gs.fit(surprise_data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    2.8s finished


In [174]:
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.9044416278944584
{'n_epochs': 20, 'lr_all': 0.005}


In [175]:
svd_model.n_epochs

20

Testing again

In [176]:
surprise_train_full_data = surprise_data.build_full_trainset()

In [177]:
svd_model.fit(surprise_train_full_data)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x17320de20>

Recommendation Model
- Movies that users have not watched

In [178]:
def get_recommendation(movies_ratings,user_id,num):
    temp_dict={}

    movies_not_watched = movies_ratings["id"][~(movies_ratings["userId"] == user_id)].drop_duplicates().values.tolist()
    print(movies_not_watched)
    
    for movie in movies_not_watched:
        # Key: Movie ID, Value: Prediction
        temp_dict[movie] = svd_model.predict(uid=user_id, iid=movie)[3]
        
    suggestions = pd.DataFrame(temp_dict.items(),columns=["MovieId",'Predicted Rating']).sort_values(by="Predicted Rating", ascending=False).head(num)
    merged = pd.merge(suggestions, moviesMetadata[["id","title"]], how="left", left_on='MovieId', right_on="id")
    

    return merged

In [179]:
get_recommendation(movie_rating_df, 1 ,10).sort_values(by="title", ascending=False)

[949, 710, 1408, 524, 4584, 5, 8012, 451, 902, 63, 687, 577, 2054, 4482, 755, 880, 8447, 2086, 2045, 688, 103, 2074, 27793, 568, 414, 1873, 8839, 8963, 26564, 1572, 8973, 1642, 281, 3512, 26258, 76, 2292, 1909, 8984, 4954, 522, 1024, 628, 11, 3036, 8986, 1945, 241, 527, 6950, 101, 2307, 680, 110, 108, 2636, 2164, 278, 193, 95963, 1587, 2064, 236, 2759, 888, 13, 1689, 854, 2788, 8987, 8831, 315, 8011, 2654, 2758, 306, 178, 4722, 26391, 2124, 49299, 5503, 2087, 2019, 26271, 2259, 7984, 6, 329, 507, 1413, 2246, 2320, 5879, 2625, 788, 1245, 7007, 2088, 424, 8850, 695, 858, 867, 78, 319, 32562, 251, 280, 581, 80350, 268, 274, 408, 114, 7300, 275, 1592, 2021, 954, 8840, 5801, 896, 2300, 27768, 753, 8866, 922, 664, 532, 935, 7095, 627, 81949, 1645, 4837, 2925, 3573, 549, 238, 159, 3587, 981, 872, 2769, 164, 426, 567, 3078, 213, 239, 289, 963, 6620, 804, 909, 630, 599, 15, 62, 705, 223, 303, 4174, 381, 1859, 1939, 3598, 198, 220, 3529, 2897, 3083, 900, 260, 488, 261, 2033, 173, 864, 8367, 433,

Unnamed: 0,MovieId,Predicted Rating,id,title
4,2064,4.029075,2064,While You Were Sleeping
5,4973,4.010961,4973,Under the Sand
7,4235,3.967415,4235,The Sicilian Clan
1,318,4.189038,318,The Million Dollar Hotel
9,6016,3.940027,6016,The Good Thief
2,296,4.082626,296,Terminator 3: Rise of the Machines
0,858,4.216224,858,Sleepless in Seattle
3,1254,4.067524,1254,"Don't Worry, I'm Fine"
8,923,3.952891,923,Dawn of the Dead
6,898,3.977244,898,Birdman of Alcatraz


In [180]:
user1 = movie_rating_df[movie_rating_df["userId"] == 1]

In [181]:
user1[['userId', 'id', 'rating']]

Unnamed: 0,userId,id,rating
10561,1,1371,2.5
11376,1,2105,4.0
13727,1,2193,2.0
18245,1,2294,2.0
27973,1,1405,1.0
31731,1,2455,2.5


HALLLTTTT extra stuff

Using iteration to test multiple algorithms to test which gives the best result
- SVD
- KNNBasic
- NMF
- SlopeOne
- CoClustering
- BaselineOnly

In [182]:
algorithms = [SVD(), KNNBasic(), NMF(), SlopeOne(), CoClustering(), BaselineOnly()]

In [183]:
def algo_test(train, test, algos):
    # To store the Root-Mean-Square deviation (RMSE) values
    list1 = []
    # To store the Mean Absolute Error (MAE) values
    list2 = []

    for algo in algos:
        algo.fit(train)
        preds = algo.test(test)

        # Metrics to determine accuracy of test
        rmse = accuracy.rmse(preds)
        mae = accuracy.mae(preds)

        # Append to the respective lists
        list1.append(rmse)
        list2.append(mae)
    return list1, list2


In [184]:
rmse_vals = []
mae_vals = []

rmse_vals, mae_vals = algo_test(trainset, testset, algorithms)

RMSE: 0.8949
MAE:  0.6897
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9602
MAE:  0.7425
RMSE: 0.9425
MAE:  0.7227
RMSE: 0.9196
MAE:  0.7033
RMSE: 0.9540
MAE:  0.7405
Estimating biases using als...
RMSE: 0.8934
MAE:  0.6913


In [185]:
combined_acc = [(rmse + mae) / 2 for rmse, mae in zip(rmse_vals, mae_vals)]
best_algo_index = combined_acc.index(min(combined_acc))
best_algo_name = algorithms[best_algo_index].__class__.__name__
print(f"Best Algorithm: {best_algo_name}")
print(f"RMSE: {rmse_vals[best_algo_index]}")
print(f"MAE: {mae_vals[best_algo_index]}")
print(f"Combined Score: {combined_acc[best_algo_index]}")

Best Algorithm: SVD
RMSE: 0.8948956449791934
MAE: 0.6896848898604119
Combined Score: 0.7922902674198027


In [186]:
svd = SVD()
data_train = data.build_full_trainset()
svd.fit(data_train)
predictions = svd.test(test_set)

Model Tuning

In [187]:
param_grid = {'n_epochs': [5, 10, 20],
              'lr_all': [0.002, 0.005, 0.007]}

In [188]:
grid_search = GridSearchCV(SVD,
                  param_grid,
                  measures=['rmse', 'mae'],
                  cv=3,
                  n_jobs=-1,
                  joblib_verbose=True)

In [189]:
grid_search.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    3.4s finished


In [190]:
best_rmse = grid_search.best_score['rmse']
best_params = grid_search.best_params['rmse']
print(f"The best RMSE score is: {best_rmse}")
print(f"The best RMSE params are: {best_params}")

The best RMSE score is: 0.9043080384501797
The best RMSE params are: {'n_epochs': 20, 'lr_all': 0.005}


In [191]:
svd.n_epochs

20

In [192]:
svd_model = SVD(**grid_search.best_params['rmse'])


In [193]:
data = data.build_full_trainset()

In [194]:
svd_model.fit(data_train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x17a46efc0>

In [195]:
svd_model.predict(uid=229, iid=16, verbose=True)

user: 229        item: 16         r_ui = None   est = 4.19   {'was_impossible': False}


Prediction(uid=229, iid=16, r_ui=None, est=4.192688656258422, details={'was_impossible': False})

In [196]:
def suggest(df,user_id,sug):
    
    didnt_watch = df["id"][~(df["userId"] == user_id)].drop_duplicates().values.tolist()
    temp_dict={}
    
    for i in didnt_watch:
        temp_dict[i] = svd_model.predict(uid=user_id, iid=i)[3]
        
    suggestions = pd.DataFrame(temp_dict.items(),columns=["id",'possible_rate']).sort_values(by="possible_rate", ascending=False).head(sug)
    merged = pd.merge(suggestions, moviesMetadata[["id","title"]], how="inner", on="id")
    
    return merged

In [197]:
moviesMetadata = moviesMetadata.drop([19730, 29503, 35587])
moviesMetadata['id'] = moviesMetadata['id'].astype('int')
movie_rating_df = pd.merge(moviesMetadata, ratings, how = 'inner', left_on='id', right_on='movieId')

KeyError: '[19730, 29503, 35587] not found in axis'

In [None]:
suggest(movie_rating_df,21,15).sort_values(by="possible_rate", ascending=False)


Unnamed: 0,id,possible_rate,title
0,318,4.264566,The Million Dollar Hotel
1,3683,4.179653,Flags of Our Fathers
2,1254,4.16954,"Don't Worry, I'm Fine"
3,953,4.157904,Madagascar
4,2064,4.141353,While You Were Sleeping
5,5618,4.138989,"Cousin, Cousine"
6,3022,4.132988,Dr. Jekyll and Mr. Hyde
7,926,4.130558,Galaxy Quest
8,1945,4.127052,Nell
9,2973,4.12447,Aelita: Queen of Mars
