## Movie Recommendation Model using SVD Matrix Factorization

#### Process Overview
- Step 1: Load MovieLens dataset and perform exploratory data analysis
- Step 2: Prep data for Singular Value Decomposition (SVD) matrix factorization
- Step 3: Build SVD machine learning model to predict users' movie ratings
- Step 4: Predict movie ratings for all users using the SVD model 
- Step 5: Check performance metrics of the model
- Step 6: Generate movie recommendations for existing users
- Step 7: Generate movie recommendations for a new user

### Step 1: Load MovieLens dataset and perform exploratory data analysis

In [1]:
### Load python modules and packages

import os
import pandas as pd
import numpy as np

In [2]:
### Load MovieLens data

# Both CSV files are read from ./data_folder under the current working directory.
file_directory = os.path.join(os.getcwd(), 'data_folder')
ratings_path = os.path.join(file_directory, 'ml-latest-small_ratings.csv')
movies_path = os.path.join(file_directory, 'ml-latest-small_movies.csv')

ratings_df = pd.read_csv(ratings_path)
movies_df = pd.read_csv(movies_path)


### Set data format

# Store ids as int32 and ratings as float32, and drop the timestamp column.
ratings_df = (
    ratings_df
    .astype({'userId': np.int32, 'movieId': np.int32, 'rating': np.float32})
    .drop(columns='timestamp')
)

ratings_df.head(5)

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of ratings_df
ratings_df.describe()

Unnamed: 0,userId,movieId,rating
count,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608
std,195.163838,26369.198969,1.058064
min,1.0,1.0,0.5
25%,182.0,1028.0,3.0
50%,367.0,2406.5,4.0
75%,520.0,5418.0,4.0
max,671.0,163949.0,5.0


In [4]:
# Convert movieId to a numeric column with one vectorized pd.to_numeric call
# (instead of applying it element by element), then store it as int32 so it
# has the same dtype as ratings_df['movieId'].
movies_df['movieId'] = pd.to_numeric(movies_df['movieId']).astype(np.int32)

movies_df.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Step 2: Prep data for Singular Value Decomposition Matrix Factorization

In [5]:
## step 1: pivot ratings_df into a user x movie matrix (unrated entries become 0)

R_df = (
    ratings_df
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)


## step 2: de-mean the data (subtract each user's row mean)
# NOTE: the row mean is taken over every movie column, including the zeros
# that stand in for unrated movies.

R = R_df.to_numpy(dtype=np.float32)
user_ratings_mean = R.mean(axis=1)

R_demeaned = R - user_ratings_mean[:, np.newaxis]

R_demeaned[0:3]

array([[-0.00562541, -0.00562541, -0.00562541, ..., -0.00562541,
        -0.00562541, -0.00562541],
       [-0.02923009, -0.02923009, -0.02923009, ..., -0.02923009,
        -0.02923009, -0.02923009],
       [-0.02007501, -0.02007501, -0.02007501, ..., -0.02007501,
        -0.02007501, -0.02007501]], dtype=float32)

### Step 3: Build SVD machine learning model to predict users' movie ratings

In [6]:
### Singular Value Decomposition

from scipy.sparse.linalg import svds

# Truncated SVD of the de-meaned matrix; k is the number of singular
# values/vectors to compute.
U, singular_values, Vt = svds(R_demeaned, k=50)


## convert the 1-D array of singular values to the diagonal matrix form

S = np.diag(singular_values)

S.shape

(50, 50)

In [7]:
# First three rows of the diagonal singular-value matrix S
S[0:3]

array([[67.60554,  0.     ,  0.     ,  0.     ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ,  0.     ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ,  0.     ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ,  0.     ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ,  0.     ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ,  0.     ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ,  0.     ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ,  0.     ,  0.     ,  0.     ,
         0.     ,  0.     ],
       [ 0.     , 67.90197,  0.     ,  0.     ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ,  0.     ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ,  0.     ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ,  0.     ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ,  0.     ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ,  0.     ,  0.     ,  0.     ,
         0.     ,  

### Step 4: Predict movie ratings for all users using the SVD model 

In [8]:
### Make predictions for all users from the decomposed matrices

# Multiply the three factors back together, then add each user's row mean
# (subtracted before the decomposition) back onto that user's row.
low_rank_approx = np.dot(U, S).dot(Vt)
predicted_ratings_all = low_rank_approx + user_ratings_mean[:, np.newaxis]

# Columns carry the movieId labels of the pivoted ratings matrix;
# rows keep the default 0..N-1 positional index.
preds_df_all = pd.DataFrame(data=predicted_ratings_all, columns=R_df.columns)

preds_df_all.head(2)

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
0,-0.054239,0.04513,-0.004834,-0.019817,-0.011284,0.041373,-0.007822,-0.017188,0.012246,0.03767,...,-0.005258,-0.005453,0.012369,-0.004991,-0.004639,-0.019055,0.021403,-0.006365,-0.006098,-0.004819
1,0.419836,1.40644,-0.188809,0.156658,0.268031,0.414697,0.052171,0.044728,-0.020198,2.220256,...,-0.005909,-0.003974,-0.012555,-0.003555,-0.002711,-0.071622,-0.016212,0.001047,-0.001468,-0.006577


In [9]:
preds_df_all.tail(2)

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
669,1.581882,0.084578,-0.046355,-0.013776,0.180043,0.742556,0.004087,-0.009703,-0.016262,0.240633,...,0.005183,0.005732,0.011723,0.00669,0.007145,-0.035749,0.014695,0.00731,0.006698,0.004585
670,3.507893,0.328824,-0.067418,0.103239,0.036801,-0.243951,0.084767,0.035271,-0.398994,0.075671,...,-0.004248,-0.001536,0.001067,0.002523,-0.000385,0.111286,-0.002568,0.012777,0.011002,-0.016834


### Step 5: Check performance metrics of the model

In [10]:
# Check RMSE evaluation metric for the model predictions
#
# The `squared=False` flag of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6. To keep this cell working on any
# version, the RMSE is derived from the per-movie (per-column) MSE values:
# take the square root of each column's MSE and average the results, which is
# the uniform multi-output average recent releases computed for
# `squared=False`.
# NOTE: R holds 0 for every movie a user has not rated, so this measures how
# well the zero-filled matrix is reconstructed.

from sklearn.metrics import mean_squared_error

per_movie_mse = mean_squared_error(R, predicted_ratings_all, multioutput='raw_values')
rmse = np.sqrt(per_movie_mse).mean()

print(f'RMSE = {rmse:.4f}')

RMSE = 0.2260


### Step 6: Generate movie recommendations for existing users

In [11]:
### Generate movie recommendations for a selected user

def movie_recommender(movies_df, ratings_df_all_users, pred_df_all_users, userId, num_recommend=10):
    """Return a user's rated movies and the top unseen movies by predicted rating.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Movie catalogue; must contain a 'movieId' column.
    ratings_df_all_users : pandas.DataFrame
        Ratings in long format with 'userId', 'movieId' and 'rating' columns.
    pred_df_all_users : pandas.DataFrame
        Predicted ratings, one row per user and one column per movieId.
        Rows are looked up by position: row ``userId - 1`` is used for
        ``userId``, whatever labels the frame's index carries.
    userId : int
        1-based id of the user to recommend for.
    num_recommend : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    user_movie_ratings : pandas.DataFrame
        The user's ratings joined with the movie columns, highest rating first.
    recommendations : pandas.DataFrame
        Rows of ``movies_df`` the user has not rated, ordered by predicted
        rating (highest first) and limited to ``num_recommend`` rows. The
        prediction column itself is not included.

    Raises
    ------
    IndexError
        If ``userId - 1`` is not a valid row position in ``pred_df_all_users``.
    """
    ## user_index = userId - 1 because userId starts from 1, while row index starts from 0
    user_index = userId - 1
    if not 0 <= user_index < len(pred_df_all_users):
        raise IndexError(
            f'userId {userId} is out of range for predictions covering '
            f'{len(pred_df_all_users)} users'
        )

    # Name the prediction column and the movieId axis explicitly so the merge
    # below does not depend on the labels of the prediction frame.
    user_predictions = (
        pred_df_all_users.iloc[user_index]
        .rename('predictions')
        .rename_axis('movieId')
        .reset_index()
    )

    ## Get the user's ratings data and merge with movies_df (movie info)
    user_ratings = ratings_df_all_users[ratings_df_all_users['userId'] == userId]
    user_movie_ratings = (
        user_ratings
        .merge(movies_df, how='left', on='movieId')
        .sort_values(['rating'], ascending=False)
    )

    ## Recommend movies with the highest predicted ratings that the user hasn't seen yet
    unseen_movies = movies_df[~movies_df['movieId'].isin(user_ratings['movieId'])]
    recommendations = (
        unseen_movies
        .merge(user_predictions, how='left', on='movieId')
        .sort_values('predictions', ascending=False)  # movies without a prediction (NaN) sort last
        .iloc[:num_recommend]
        .drop(columns='predictions')
    )

    return user_movie_ratings, recommendations


# Rated movies and top-10 recommendations for userId=1
user_1_movie_ratings, user_1_recommendations = movie_recommender(
    movies_df, ratings_df, preds_df_all, userId=1, num_recommend=10
)

# movie ratings that were provided by userId=1
user_1_movie_ratings

Unnamed: 0,userId,movieId,rating,title,genres
4,1,1172,4.0,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama
13,1,2105,4.0,Tron (1982),Action|Adventure|Sci-Fi
12,1,1953,4.0,"French Connection, The (1971)",Action|Crime|Thriller
8,1,1339,3.5,Dracula (Bram Stoker's Dracula) (1992),Fantasy|Horror|Romance|Thriller
19,1,3671,3.0,Blazing Saddles (1974),Comedy|Western
1,1,1029,3.0,Dumbo (1941),Animation|Children|Drama|Musical
2,1,1061,3.0,Sleepers (1996),Thriller
14,1,2150,3.0,"Gods Must Be Crazy, The (1980)",Adventure|Comedy
17,1,2455,2.5,"Fly, The (1986)",Drama|Horror|Sci-Fi|Thriller
0,1,31,2.5,Dangerous Minds (1995),Drama


In [12]:
# Top 10 movies that are recommended to userId=1 (highest predicted rating first)
user_1_recommendations

Unnamed: 0,movieId,title,genres
1103,1374,Star Trek II: The Wrath of Khan (1982),Action|Adventure|Sci-Fi|Thriller
1503,1954,Rocky (1976),Drama
2379,2987,Who Framed Roger Rabbit? (1988),Adventure|Animation|Children|Comedy|Crime|Fant...
2533,3175,Galaxy Quest (1999),Adventure|Comedy|Sci-Fi
2759,3479,Ladyhawke (1985),Adventure|Fantasy|Romance
2348,2947,Goldfinger (1964),Action|Adventure|Thriller
1899,2406,Romancing the Stone (1984),Action|Adventure|Comedy|Romance
1030,1282,Fantasia (1940),Animation|Children|Fantasy|Musical
1506,1957,Chariots of Fire (1981),Drama
953,1201,"Good, the Bad and the Ugly, The (Buono, il bru...",Action|Adventure|Western


In [13]:
## Create a reader-friendly movie recommendations table

# keep only the title/genre columns and give them display names
display_names = {'title': 'Movie Title', 'genres': 'Genres'}
user_1_recommendations_df = user_1_recommendations[['title', 'genres']].rename(columns=display_names)

# replace the row labels with a 1..N ranking
user_1_recommendations_df = user_1_recommendations_df.set_axis(
    np.arange(1, len(user_1_recommendations_df) + 1), axis=0
)

user_1_recommendations_df

Unnamed: 0,Movie Title,Genres
1,Star Trek II: The Wrath of Khan (1982),Action|Adventure|Sci-Fi|Thriller
2,Rocky (1976),Drama
3,Who Framed Roger Rabbit? (1988),Adventure|Animation|Children|Comedy|Crime|Fant...
4,Galaxy Quest (1999),Adventure|Comedy|Sci-Fi
5,Ladyhawke (1985),Adventure|Fantasy|Romance
6,Goldfinger (1964),Action|Adventure|Thriller
7,Romancing the Stone (1984),Action|Adventure|Comedy|Romance
8,Fantasia (1940),Animation|Children|Fantasy|Musical
9,Chariots of Fire (1981),Drama
10,"Good, the Bad and the Ugly, The (Buono, il bru...",Action|Adventure|Western


### Step 7: Generate movie recommendations for a new user

#### Step 7.1: Add new user data to the existing ratings data

In [14]:
# Let's assume a new user's favorite movies are the following:
# Toy Story (1995), Lion King, The (1994), Shrek (2001), Finding Nemo (2003)
# which can be converted as a list of movieIds as below:

selected = [1, 364, 4306, 6377]


### Add new user's data to ratings_df:

## step 1: transform new user's data into a dataframe

# Next free id = largest existing userId + 1. Using max() does not depend on
# the row order of ratings_df and avoids calling int() on a one-element
# Series, which recent pandas versions deprecate.
new_user_id = int(ratings_df['userId'].max()) + 1  #user_id = 672
new_user_rating = 5.0  # every selected favorite gets this rating

new_user_movie_ids = selected  # eg., selected = [1, 364, 4306, 6377]

# new user data in dataframe format: one row per selected movie
new_user_df = pd.DataFrame({
    "userId": new_user_id,
    "movieId": [int(movie_id) for movie_id in new_user_movie_ids],
    "rating": new_user_rating,
})


## step 2: add new_user_df to ratings_df

# ignore_index=True gives the combined frame a fresh 0..N-1 index instead of
# repeating the row labels that new_user_df would otherwise bring along.
ratings_df_all = pd.concat([ratings_df, new_user_df], axis=0, ignore_index=True)

ratings_df_all.describe()

Unnamed: 0,userId,movieId,rating
count,100008.0,100008.0,100008.0
mean,347.024308,12548.272928,3.543667
std,195.170757,26368.74972,1.058083
min,1.0,1.0,0.5
25%,182.0,1028.0,3.0
50%,367.0,2406.5,4.0
75%,520.0,5418.0,4.0
max,672.0,163949.0,5.0


#### Steps 7.2–7.3: Prep data and build SVD machine learning model to predict users' movie ratings

In [15]:
### Prep data for Singular Value Decomposition

## step 1: pivot ratings_df_all into a user x movie matrix (unrated entries become 0)

R_df = (
    ratings_df_all
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

## step 2: de-mean the data (subtract each user's row mean)

R = R_df.to_numpy(dtype=np.float32)
user_ratings_mean = R.mean(axis=1)
R_demeaned = R - user_ratings_mean[:, np.newaxis]


### Singular Value Decomposition
# k is the number of singular values/vectors to compute
U, singular_values, Vt = svds(R_demeaned, k=50)

## convert the 1-D array of singular values to the diagonal matrix form
S = np.diag(singular_values)


#### Step 7.4: Predict movie ratings for all users using the SVD model

In [16]:
### Make predictions for all users from the decomposed matrices

# Multiply the three factors back together, then add each user's row mean
# (subtracted before the decomposition) back onto that user's row.
low_rank_approx = np.dot(U, S).dot(Vt)
predicted_ratings_all = low_rank_approx + user_ratings_mean[:, np.newaxis]

# Columns carry the movieId labels of the pivoted ratings matrix;
# rows keep the default 0..N-1 positional index.
preds_df_all = pd.DataFrame(data=predicted_ratings_all, columns=R_df.columns)

preds_df_all.tail(2)

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
670,3.515325,0.328095,-0.067937,0.102718,0.037114,-0.243745,0.083728,0.035092,-0.39961,0.075458,...,-0.004279,-0.001555,0.000999,0.002514,-0.000386,0.111181,-0.002657,0.012742,0.01097,-0.016872
671,1.097249,0.287423,0.169659,-0.009603,0.034854,0.053165,0.030104,0.037564,0.001634,-0.013651,...,-0.0012,3.6e-05,0.003627,-0.002207,-0.003409,0.038213,0.005341,0.000918,0.00063,-0.002598


#### Step 7.5: Check performance metrics of the model

In [17]:
# Check RMSE evaluation metric for the model predictions
#
# mean_squared_error is the function imported from sklearn.metrics in the
# Step 5 cell. Its `squared=False` flag was deprecated in scikit-learn 1.4 and
# removed in 1.6, so the RMSE is derived from the per-movie (per-column) MSE
# values instead: square root of each column's MSE, then the plain average,
# which is the uniform multi-output average recent releases computed for
# `squared=False`.

per_movie_mse = mean_squared_error(R, predicted_ratings_all, multioutput='raw_values')
rmse = np.sqrt(per_movie_mse).mean()

print(f'RMSE = {rmse:.4f}')

RMSE = 0.2258


#### Step 7.6: Generate movie recommendations for existing users

In [18]:
# Rated movies and top-10 recommendations for userId=1, using the ratings and
# predictions that include the new user
user_1_movie_ratings, user_1_recommendations = movie_recommender(
    movies_df, ratings_df_all, preds_df_all, userId=1, num_recommend=10
)

# movie ratings that were provided by userId=1
user_1_movie_ratings

Unnamed: 0,userId,movieId,rating,title,genres
4,1,1172,4.0,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama
13,1,2105,4.0,Tron (1982),Action|Adventure|Sci-Fi
12,1,1953,4.0,"French Connection, The (1971)",Action|Crime|Thriller
8,1,1339,3.5,Dracula (Bram Stoker's Dracula) (1992),Fantasy|Horror|Romance|Thriller
19,1,3671,3.0,Blazing Saddles (1974),Comedy|Western
1,1,1029,3.0,Dumbo (1941),Animation|Children|Drama|Musical
2,1,1061,3.0,Sleepers (1996),Thriller
14,1,2150,3.0,"Gods Must Be Crazy, The (1980)",Adventure|Comedy
17,1,2455,2.5,"Fly, The (1986)",Drama|Horror|Sci-Fi|Thriller
0,1,31,2.5,Dangerous Minds (1995),Drama


In [19]:
# Top 10 movies that are recommended to userId=1 (highest predicted rating first)
user_1_recommendations

Unnamed: 0,movieId,title,genres
1103,1374,Star Trek II: The Wrath of Khan (1982),Action|Adventure|Sci-Fi|Thriller
1503,1954,Rocky (1976),Drama
2379,2987,Who Framed Roger Rabbit? (1988),Adventure|Animation|Children|Comedy|Crime|Fant...
2533,3175,Galaxy Quest (1999),Adventure|Comedy|Sci-Fi
2759,3479,Ladyhawke (1985),Adventure|Fantasy|Romance
2348,2947,Goldfinger (1964),Action|Adventure|Thriller
1899,2406,Romancing the Stone (1984),Action|Adventure|Comedy|Romance
1030,1282,Fantasia (1940),Animation|Children|Fantasy|Musical
1506,1957,Chariots of Fire (1981),Drama
953,1201,"Good, the Bad and the Ugly, The (Buono, il bru...",Action|Adventure|Western


In [20]:
## Create a reader-friendly movie recommendations table

# keep only the title/genre columns and give them display names
display_names = {'title': 'Movie Title', 'genres': 'Genres'}
user_1_recommendations_df = user_1_recommendations[['title', 'genres']].rename(columns=display_names)

# replace the row labels with a 1..N ranking
user_1_recommendations_df = user_1_recommendations_df.set_axis(
    np.arange(1, len(user_1_recommendations_df) + 1), axis=0
)

user_1_recommendations_df

Unnamed: 0,Movie Title,Genres
1,Star Trek II: The Wrath of Khan (1982),Action|Adventure|Sci-Fi|Thriller
2,Rocky (1976),Drama
3,Who Framed Roger Rabbit? (1988),Adventure|Animation|Children|Comedy|Crime|Fant...
4,Galaxy Quest (1999),Adventure|Comedy|Sci-Fi
5,Ladyhawke (1985),Adventure|Fantasy|Romance
6,Goldfinger (1964),Action|Adventure|Thriller
7,Romancing the Stone (1984),Action|Adventure|Comedy|Romance
8,Fantasia (1940),Animation|Children|Fantasy|Musical
9,Chariots of Fire (1981),Drama
10,"Good, the Bad and the Ugly, The (Buono, il bru...",Action|Adventure|Western


#### Step 7.7: Retrieve movie recommendations for the new user

In [21]:
# Rated movies and top-10 recommendations for the newly added user
new_user_movie_ratings, new_user_recommendations = movie_recommender(
    movies_df, ratings_df_all, preds_df_all, userId=new_user_id, num_recommend=10
)

# movie ratings that were provided by the new user (userId=new_user_id)
new_user_movie_ratings

Unnamed: 0,userId,movieId,rating,title,genres
0,672,1,5.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,672,364,5.0,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
2,672,4306,5.0,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...
3,672,6377,5.0,Finding Nemo (2003),Adventure|Animation|Children|Comedy


In [22]:
# Top 10 movies that are recommended to the new user (userId=new_user_id)
new_user_recommendations

Unnamed: 0,movieId,title,genres
5622,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy
2504,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
3802,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
519,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
525,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
2210,2762,"Sixth Sense, The (1999)",Drama|Horror|Mystery
1864,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy
5388,8360,Shrek 2 (2004),Adventure|Animation|Children|Comedy|Musical|Ro...
4392,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
4689,6539,Pirates of the Caribbean: The Curse of the Bla...,Action|Adventure|Comedy|Fantasy


In [23]:
## Create a reader-friendly movie recommendations table

# keep only the title/genre columns and give them display names
display_names = {'title': 'Movie Title', 'genres': 'Genres'}
new_user_recommendations_df = new_user_recommendations[['title', 'genres']].rename(columns=display_names)

# replace the row labels with a 1..N ranking
new_user_recommendations_df = new_user_recommendations_df.set_axis(
    np.arange(1, len(new_user_recommendations_df) + 1), axis=0
)

new_user_recommendations_df

Unnamed: 0,Movie Title,Genres
1,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy
2,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
3,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
4,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
5,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
6,"Sixth Sense, The (1999)",Drama|Horror|Mystery
7,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy
8,Shrek 2 (2004),Adventure|Animation|Children|Comedy|Musical|Ro...
9,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
10,Pirates of the Caribbean: The Curse of the Bla...,Action|Adventure|Comedy|Fantasy
