## Movie Lens Recommendation Engine

Using GroupLens Movie Database https://files.grouplens.org/datasets/movielens/ml-25m-README.html

In [73]:
import pandas as pd
import numpy as np

In [105]:
# read movie ratings dataset, take a sample data of 10000
movie_ratings_df = pd.read_csv("C:\\Lenovo\\Learn\\LinkedIn Learning\\ml-25m\\ratings.csv", header = 0, nrows=50000
                               , usecols=['userId', 'movieId', 'rating'])

In [106]:
# convert ratings to a 10 point scale
movie_ratings_df['rating'] = np.array(movie_ratings_df['rating']) * 2
movie_ratings_df.head(5)

Unnamed: 0,userId,movieId,rating
0,1,296,10.0
1,1,306,7.0
2,1,307,10.0
3,1,665,10.0
4,1,899,7.0


In [216]:
# read movies dataset - 62423 movies
movies_df = pd.read_csv("C:\\Lenovo\\Learn\\LinkedIn Learning\\ml-25m\\movies.csv", header=0)

In [220]:
# select only those movies which are available in ratings dataset
movies_df = movies_df[movies_df['movieId'].isin(movie_ratings_df['movieId'])]

In [222]:
#movies_df.set_index('movieId', inplace=True)
movies_df.reset_index(inplace=True,drop=True)
print(movies_df.shape)
movies_df.head(5)

(6489, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Map Users to All Movies based on ratings
A good view of the data would be to have users mapped to movies in a matrix. This will be a sparse matrix since maximum values would be null

In [110]:
# create movie-user matrix with ratings dataset
ratings_df = pd.pivot_table(movie_ratings_df, index='userId', columns='movieId', aggfunc=np.max)

In [111]:
# Since only a subset of users are considered, not all movies are available
print(ratings_df.shape)
ratings_df.head(5)

(406, 6489)


Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
movieId,1,2,3,4,5,6,7,9,10,11,...,203218,203244,203375,203513,203519,203649,204542,204692,204698,205106
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,,,,,,,,,,,...,,,,,,,,,,
2,7.0,,,,,,,,,,...,,,,,,,,,,
3,8.0,,,,,,,,,,...,,,,,,,,,,
4,6.0,,,,,,,,,,...,,,9.0,,5.0,,,,,
5,8.0,,,,,,,,,,...,,,,,,,,,,


Now, we need an optimization function that will reside in utilities file. Source: https://www.linkedin.com/learning/machine-learning-and-ai-foundations-recommendations

In [114]:
%%writefile matrix_factorization_utilities.py

import numpy as np
from scipy.optimize import fmin_cg

def normalize_ratings(ratings):
    """
    Given an array of user ratings, subtract the mean of each product's ratings
    :param ratings: 2d array of user ratings
    :return: (normalized ratings array, the calculated means)
    """
    mean_ratings = np.nanmean(ratings, axis=0)
    return ratings - mean_ratings, mean_ratings

def cost(X, *args):
    """
    Cost function for low rank matrix factorization
    :param X: The matrices being factored (P and Q) rolled up as a contiguous array
    :param args: Array containing (num_users, num_products, num_features, ratings, mask, regularization_amount)
    :return: The cost with the current P and Q matrices
    """
    num_users, num_products, num_features, ratings, mask, regularization_amount = args

    # Unroll P and Q
    P = X[0:(num_users * num_features)].reshape(num_users, num_features)
    Q = X[(num_users * num_features):].reshape(num_products, num_features)
    Q = Q.T

    # Calculate current cost
    return (np.sum(np.square(mask * (np.dot(P, Q) - ratings))) / 2) + ((regularization_amount / 2.0) * np.sum(np.square(Q.T))) + ((regularization_amount / 2.0) * np.sum(np.square(P)))


def gradient(X, *args):
    """
    Calculate the cost gradients with the current P and Q.
    :param X: The matrices being factored (P and Q) rolled up as a contiguous array
    :param args: Array containing (num_users, num_products, num_features, ratings, mask, regularization_amount)
    :return: The gradient with the current X
    """
    num_users, num_products, num_features, ratings, mask, regularization_amount = args

    # Unroll P and Q
    P = X[0:(num_users * num_features)].reshape(num_users, num_features)
    Q = X[(num_users * num_features):].reshape(num_products, num_features)
    Q = Q.T

    # Calculate the current gradients for both P and Q
    P_grad = np.dot((mask * (np.dot(P, Q) - ratings)), Q.T) + (regularization_amount * P)
    Q_grad = np.dot((mask * (np.dot(P, Q) - ratings)).T, P) + (regularization_amount * Q.T)

    # Return the gradients as one rolled-up array as expected by fmin_cg
    return np.append(P_grad.ravel(), Q_grad.ravel())

def low_rank_matrix_factorization(ratings, mask=None, num_features=15, regularization_amount=0.01):
    """
    Factor a ratings array into two latent feature arrays (user features and product features)

    :param ratings: Matrix with user ratings to factor
    :param mask: A binary mask of which ratings are present in the ratings array to factor
    :param num_features: Number of latent features to generate for users and products
    :param regularization_amount: How much regularization to apply
    :return: (P, Q) - the factored latent feature arrays
    """
    num_users, num_products = ratings.shape

    # If no mask is provided, consider all 'NaN' elements as missing and create a mask.
    if mask is None:
        mask = np.invert(np.isnan(ratings))

    # Replace NaN values with zero
    ratings = np.nan_to_num(ratings)

    # Create P and Q and fill with random numbers to start
    np.random.seed(0)
    P = np.random.randn(num_users, num_features)
    Q = np.random.randn(num_products, num_features)

    # Roll up P and Q into a contiguous array as fmin_cg expects
    initial = np.append(P.ravel(), Q.ravel())

    # Create an args array as fmin_cg expects
    args = (num_users, num_products, num_features, ratings, mask, regularization_amount)

    # Call fmin_cg to minimize the cost function and this find the best values for P and Q
    X = fmin_cg(cost, initial, fprime=gradient, args=args, maxiter=3000)

    # Unroll the new P and new Q arrays out of the contiguous array returned by fmin_cg
    nP = X[0:(num_users * num_features)].reshape(num_users, num_features)
    nQ = X[(num_users * num_features):].reshape(num_products, num_features)

    return nP, nQ.T

def RMSE(real, predicted):
    """
    Calculate the root mean squared error between a matrix of real ratings and predicted ratings
    :param real: A matrix containing the real ratings (with 'NaN' for any missing elements)
    :param predicted: A matrix of predictions
    :return: The RMSE as a float
    """
    return np.sqrt(np.nanmean(np.square(real - predicted)))

Overwriting matrix_factorization_utilities.py


Matrix Factorization will help get latent features that map users to movies. Matrix U (100000,x) has all users with and x latent features explaining related to movies, and Matrix M (x, 9786) has x features mapped to all movies

In [115]:
import matrix_factorization_utilities

In [116]:
# Get latent features by applying matrix factorization
%timeit
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(ratings_df.values, num_features=20
                                                                    ,regularization_amount=1.0)

         Current function value: 19248.008173
         Iterations: 3000
         Function evaluations: 4505
         Gradient evaluations: 4505


In [119]:
print("Matrix U dimensions are Users * Latent Features {}".format(U.shape))
print()
print("Matrix M dimensions are Latent Features * Movies {}".format(M.shape))

Matrix U dimensions are Users * Latent Features (406, 20)

Matrix M dimensions are Latent Features * Movies (20, 6489)


In [120]:
# get predicted ratings for all movies and all users
predicted_ratings = np.matmul(U, M)

In [121]:
pd.DataFrame(predicted_ratings).iloc[:10,:10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,6.529561,6.107273,2.655404,2.68894,1.819179,1.105279,6.360209,2.989621,0.882227,9.478255
1,8.205389,2.064071,11.343088,4.164517,2.655246,1.356098,0.350194,7.964448,0.96883,6.912034
2,8.623577,6.852702,2.5562,4.468553,4.444841,5.150876,3.31535,4.032819,7.022505,7.291252
3,5.865841,4.119415,1.438179,4.611196,2.814837,5.211955,7.918229,3.985196,2.646161,9.502276
4,8.827784,6.320449,7.562108,4.479909,5.822233,4.422692,2.202824,4.706816,5.052043,6.862984
5,8.369063,6.102296,6.694671,4.650061,7.277228,7.587778,10.18474,5.9714,4.759916,7.405433
6,7.026229,6.546823,7.180021,3.390443,3.955735,8.580651,8.33671,5.783895,6.088349,5.199853
7,8.784519,6.745179,7.790987,4.557083,8.931944,5.296938,1.946016,3.996873,7.837075,5.682703
8,12.117929,10.158814,7.088229,7.548237,8.821674,7.122615,8.92267,7.796847,9.774299,10.019025
9,7.404666,6.226593,2.9195,3.184094,4.599252,6.689983,3.873255,4.268826,6.716377,5.463334


In [232]:
pd.DataFrame(predicted_ratings).iloc[400:,6480:]

Unnamed: 0,6480,6481,6482,6483,6484,6485,6486,6487,6488
400,1.096827,0.407074,1.096827,0.226152,1.919446,0.203941,1.919446,6.486038,2.421064
401,3.170642,4.045716,3.170642,2.24762,5.548623,0.478989,5.548623,8.250339,5.696534
402,1.891539,2.229679,1.891539,1.23871,3.310194,0.659994,3.310194,5.711065,4.997432
403,1.028597,2.451579,1.028597,1.361988,1.800045,0.56137,1.800045,4.423782,5.229516
404,0.483041,3.029055,0.483041,1.682808,0.845322,0.464565,0.845322,4.636793,4.506473
405,1.920167,3.891069,1.920167,2.161705,3.360292,-0.037242,3.360292,4.101625,5.327866


### Make Recommendations to User

Now that we have predicted ratings for all movies and users, we can take an example to check predictions

In [122]:
# lets take random user
user_id_to_search = 100

In [123]:
# Check all movies that were already rated by user
reviewed_movies_df = movie_ratings_df[movie_ratings_df['userId'] == user_id_to_search]
reviewed_movies_df = reviewed_movies_df.join(movies_df, on='movieId')

print('Top Movies previously rated by user {}'.format(user_id_to_search))
reviewed_movies_df[['title', 'genres', 'rating']].sort_values(by=['rating'], ascending=False).head(10)

Top Movies previously rated by user 100


Unnamed: 0,title,genres,rating
13283,Breaking the Waves (1996),Drama|Mystery,10.0
13285,Sling Blade (1996),Drama,10.0
13268,Stealing Beauty (1996),Drama,10.0
13293,Donnie Brasco (1997),Crime|Drama,10.0
13280,One Flew Over the Cuckoo's Nest (1975),Drama,10.0
13261,Dead Man (1995),Drama|Mystery|Western,10.0
13282,"Boot, Das (Boat, The) (1981)",Action|Drama|War,10.0
13295,"Garden of the Finzi-Continis, The (Giardino de...",Drama,8.0
13287,"Crucible, The (1996)",Drama,8.0
13259,Mulholland Falls (1996),Crime|Drama|Thriller,8.0


In [234]:
# we can get movies to recommend by searching predicted ratings for top scores
predicted_user_ratings = predicted_ratings[user_id_to_search-1]
print(len(predicted_user_ratings))
predicted_user_ratings

6489


array([7.60864329, 3.6789525 , 4.14384437, ..., 1.42809017, 6.02782406,
       3.88149412])

In [236]:
# update movies df which has predicted ratings of this user
movies_df.insert(3, 'ratings_user100', predicted_user_ratings, allow_duplicates=True)

In [238]:
# remove movies that user has already rated
recommedation_user10 = movies_df[~movies_df['movieId'].isin(reviewed_movies_df['movieId'])]
recommedation_user10 = recommedation_user10.sort_values(by=['ratings_user100'], ascending=False)

In [242]:
print("Top 10 Movie Recommedations for User {} are".format(user_id_to_search))
recommedation_user10.head(15)

Top 10 Movie Recommedations for User 100 are


Unnamed: 0,movieId,title,genres,ratings_user100
174,223,Clerks (1994),Comedy,14.171472
901,1304,Butch Cassidy and the Sundance Kid (1969),Action|Western,13.796118
830,1230,Annie Hall (1977),Comedy|Romance,12.023721
821,1220,"Blues Brothers, The (1980)",Action|Comedy|Musical,11.707061
397,509,"Piano, The (1993)",Drama|Romance,11.592121
224,288,Natural Born Killers (1994),Action|Crime|Thriller,11.326804
273,349,Clear and Present Danger (1994),Action|Crime|Drama|Thriller,11.288501
1287,1968,"Breakfast Club, The (1985)",Comedy|Drama,11.249835
948,1376,Star Trek IV: The Voyage Home (1986),Adventure|Comedy|Sci-Fi,10.751172
3217,5989,Catch Me If You Can (2002),Crime|Drama,10.673835


### Find SImilar Products/Movies

Movies are similar if they have similar genre/similar user rating etc. We can get this information from its latent features and comparing feature of other movies. Ones with least difference are most similar

In [247]:
# lets take random movie
movie_to_search = 345

In [249]:
# get details of current movie
movies_df.loc[movies_df['movieId'] == movie_to_search, ['movieId','title', 'genres']]

Unnamed: 0,movieId,title,genres
269,345,"Adventures of Priscilla, Queen of the Desert, ...",Comedy|Drama


In [186]:
M= np.transpose(M)

In [244]:
M.shape

(6489, 20)

In [252]:
# latent features of current movie
movie_row_number = movies_df.loc[movies_df['movieId'] == movie_to_search].index
current_movie_features = M[movie_row_number]
print("The latent features of movie id {} are".format(movie_to_search))
print(current_movie_features)

The latent features of movie id 345 are
[[ 0.64806346  0.2849036  -0.2178809  -1.48822599 -1.19943008 -0.937442
   0.21204975  0.49513034  0.09571883 -0.22978002 -0.30044208 -0.69137578
  -0.41385313 -0.97815439  0.68960528 -0.41163416  0.06021185 -0.41445894
  -0.48143502 -0.77943785]]


In [253]:
# get difference in features
difference = M - current_movie_features
difference.shape

(6489, 20)

In [254]:
# take absolute difference and sum to get total_difference
total_difference = np.sum(np.abs(difference), axis=1)

In [256]:
movies_df.insert(4,'difference_score',total_difference, allow_duplicates=True)

In [257]:
sorted_movies_df = movies_df.sort_values(by='difference_score')

In [258]:
print('Top 10 movies similar to movie with Id {} are'.format(movie_to_search))
sorted_movies_df[['title','genres','difference_score']][0:12]

Top 10 movies similar to movie with Id 345 are


Unnamed: 0,title,genres,difference_score
269,"Adventures of Priscilla, Queen of the Desert, ...",Comedy|Drama,0.0
2518,"Crouching Tiger, Hidden Dragon (Wo hu cang lon...",Action|Drama|Romance,5.02856
167,Before Sunrise (1995),Drama|Romance,6.039825
640,Father of the Bride (1950),Comedy,6.076193
4732,Boy A (2007),Crime|Drama,6.276048
5567,Searching for Sugar Man (2012),Documentary,6.310145
5476,Sherlock Holmes: A Game of Shadows (2011),Action|Adventure|Comedy|Crime|Mystery|Thriller,6.643658
4593,Into the Wild (2007),Action|Adventure|Drama,6.699786
823,Full Metal Jacket (1987),Drama|War,6.765646
4479,Zodiac (2007),Crime|Drama|Thriller,6.773765
