<a href="https://colab.research.google.com/github/manashpratim/Recommender-System/blob/main/Recommender_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [99]:
import pandas as pd
import webbrowser
import os
import numpy as np
from scipy.optimize import fmin_cg

## **Helper Functions**

In [100]:
def web_viewer(dataframe, filename, resolve_na=False):
    """Render a DataFrame as an HTML table and open it in the default browser.

    This is only useful when the notebook runs locally; a hosted kernel has
    no local browser for `webbrowser` to open.

    Parameters
    ----------
    dataframe : object exposing ``to_html`` (e.g. a pandas DataFrame)
        The table to render.
    filename : str
        Path of the HTML file to write (overwritten if it already exists).
    resolve_na : bool, default False
        When True, missing values are rendered as empty cells instead of
        the default "NaN" text.
    """
    # Build the HTML for the table, optionally blanking out missing values.
    if resolve_na:
        html = dataframe.to_html(na_rep="")
    else:
        html = dataframe.to_html()

    with open(filename, "w") as f:
        f.write(html)

    # The browser needs an absolute location to resolve the file.
    full_filename = os.path.abspath(filename)
    webbrowser.open("file://{}".format(full_filename))
# The ratings matrix uses NaN for missing entries, so NaN-aware statistics are required.
def normalize_ratings(ratings):
    """Center every column of `ratings` on its own mean, ignoring NaN entries.

    Returns a tuple ``(centered_ratings, column_means)``. Missing entries
    stay NaN in the centered matrix.
    """
    column_means = np.nanmean(ratings, axis=0)
    centered = ratings - column_means
    return centered, column_means

def RMSE(true, predicted):
    """Root-mean-square error between `true` and `predicted`.

    Positions where the difference is NaN (e.g. a missing true rating) are
    left out of the mean.
    """
    residuals = true - predicted
    mean_squared_error = np.nanmean(np.square(residuals))
    return np.sqrt(mean_squared_error)


def cost(X, *args):
    """Regularized squared-error cost of a candidate factorization.

    `X` is the flat parameter vector: the user-feature matrix (row-major)
    followed by the product-feature matrix (row-major). `args` is the tuple
    ``(num_users, num_products, num_features, ratings, mask, reg_amt)``.
    Only entries selected by `mask` contribute to the error term.
    """
    num_users, num_products, num_features, ratings, mask, reg_amt = args

    # Unroll the flat vector back into the two factor matrices.
    boundary = num_users * num_features
    U = X[:boundary].reshape(num_users, num_features)
    product_rows = X[boundary:].reshape(num_products, num_features)
    M = product_rows.T

    masked_error = mask * (np.dot(U, M) - ratings)
    error_term = np.sum(np.square(masked_error)) / 2
    product_penalty = (reg_amt / 2.0) * np.sum(np.square(product_rows))
    user_penalty = (reg_amt / 2.0) * np.sum(np.square(U))

    return error_term + product_penalty + user_penalty


def gradient(X, *args):
    """Gradient of the regularized squared-error cost with respect to `X`.

    `X` and `args` have the same layout as for `cost`. The result is a flat
    vector: the gradient for the user-feature matrix followed by the
    gradient for the product-feature matrix, both row-major, which is the
    shape fmin_cg expects.
    """
    num_users, num_products, num_features, ratings, mask, reg_amt = args

    boundary = num_users * num_features
    U = X[:boundary].reshape(num_users, num_features)
    M = X[boundary:].reshape(num_products, num_features).T

    # The masked prediction error is shared by both partial derivatives.
    masked_error = mask * (np.dot(U, M) - ratings)

    grad_U = np.dot(masked_error, M.T) + (reg_amt * U)
    grad_M = np.dot(masked_error.T, U) + (reg_amt * M.T)

    return np.concatenate((grad_U.ravel(), grad_M.ravel()))

def low_rank_matrix_factorization(ratings, mask=None, num_features=15, reg_amt=0.01):
    """Factorize `ratings` into user and product feature matrices.

    Parameters
    ----------
    ratings : 2-D array (users x products); NaN marks a missing rating.
    mask : optional 2-D array of the same shape selecting the entries that
        count towards the error. When omitted, every non-NaN entry counts.
    num_features : number of latent features per user / product.
    reg_amt : regularization strength passed to the cost and gradient.

    Returns
    -------
    (U, M) where U is (users x num_features) and M is
    (num_features x products).
    """
    num_users, num_products = ratings.shape
    boundary = num_users * num_features

    # Without an explicit mask, treat every NaN entry as "not rated".
    if mask is None:
        mask = ~np.isnan(ratings)

    # NaNs would poison the arithmetic; masked-out entries never contribute.
    ratings = np.nan_to_num(ratings)

    # Fixed seed so the random starting point (and the result) is repeatable.
    np.random.seed(0)
    U = np.random.randn(num_users, num_features)
    M = np.random.randn(num_products, num_features)

    # fmin_cg optimizes a single flat vector, so roll both matrices into one.
    initial = np.concatenate((U.ravel(), M.ravel()))
    optimizer_args = (num_users, num_products, num_features, ratings, mask, reg_amt)

    X = fmin_cg(cost, initial, fprime=gradient, args=optimizer_args, maxiter=5000)

    # Unroll the optimized vector back into the two factor matrices.
    nU = X[:boundary].reshape(num_users, num_features)
    nM = X[boundary:].reshape(num_products, num_features)

    return nU, nM.T

In [101]:
# Load the ratings table from CSV.
movie_ratings_df = pd.read_csv("movie_ratings_data_set.csv")
# Write the table to an HTML file and open it in the default browser (local runs only).
web_viewer(movie_ratings_df,'movie_ratings.html')

In [102]:
# Each row is a single rating, so a user appears once for every movie they reviewed.
movie_ratings_df.head(3)

Unnamed: 0,user_id,movie_id,value
0,1,28,4
1,1,26,4
2,1,9,4


In [103]:
# Load the movie metadata, using the movie_id column as the index
movies_df = pd.read_csv('movies.csv', index_col='movie_id')
movies_df.head(3)

Unnamed: 0_level_0,title,genre
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,The Sheriff 1,"crime drama, western"
2,The Big City Judge 1,legal drama
3,The Sheriff 2,"crime drama, western"


In [104]:
# Convert the dataframe of user ratings into a matrix using the 'pivot table' function.
# Each row of the matrix is a unique user and each column is a movie.
# If a user rated a movie multiple times, we take the mean of those ratings.
# The aggregation is named by string ('mean') because recent pandas versions warn
# when a NumPy callable such as np.mean is passed as aggfunc.
ratings_df = pd.pivot_table(movie_ratings_df, index='user_id', columns='movie_id', aggfunc='mean')

# NaN marks a movie the user has not reviewed, so the matrix is very sparse.
# Our goal is to fill in the missing entries from the ratings we do have, using a recommendation system.
ratings_df.head(3)

Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
movie_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2
1,4.0,,,,,,,,4.0,,,,5.0,4.0,,,,,,,,,,,,4.0,,4.0,,,,,,
2,5.0,5.0,,,,,,,,,,,,4.0,4.0,,,,,,5.0,,,,,,,,,,3.0,,,4.0
3,4.0,4.0,5.0,,,,,,,,,,5.0,,,,,,,,,3.0,3.0,,,,,,,,,,,


In [105]:
# Normalize the ratings (center each movie's column around that movie's mean rating)
normalized_ratings, means = normalize_ratings(ratings_df.values)
# Factorize the centered matrix: U is (users x features), M is (features x movies)
U, M = low_rank_matrix_factorization(normalized_ratings,num_features=15,reg_amt=0.01)

Optimization terminated successfully.
         Current function value: 1.107595
         Iterations: 4436
         Function evaluations: 6586
         Gradient evaluations: 6586


In [106]:
# Rebuild the full ratings matrix from the factors and add back the per-movie means
# that were removed during normalization.
predicted_ratings = np.matmul(U, M) + means

# Save all the predicted ratings to a csv file, labelled like the original ratings matrix
predicted_ratings_df = pd.DataFrame(
    predicted_ratings,
    index=ratings_df.index,
    columns=ratings_df.columns,
)
predicted_ratings_df.to_csv("predicted_ratings.csv")

In [107]:
# Preview the predicted ratings for the first three users
predicted_ratings_df.head(3)

Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
movie_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2
1,4.002306,4.184191,4.629057,4.348983,4.71953,4.814144,4.371344,4.229079,4.003538,4.397279,4.328557,4.981583,4.998033,4.001564,4.852226,4.716521,2.468213,3.508564,3.098644,3.894102,4.005289,4.138768,3.55767,4.520127,4.434809,4.002598,4.000889,4.000275,3.486506,3.909822,4.163651,4.496829,3.533359,4.052764
2,4.998644,4.998318,4.961577,4.781617,4.996043,4.656861,3.907244,4.94876,4.909227,4.831372,4.579579,4.63764,4.511412,4.001575,4.001686,4.745008,2.381333,2.565666,4.022234,4.39465,4.998848,4.630373,3.886025,4.766432,4.694462,4.751537,4.325124,4.50602,3.34006,3.362674,2.999699,4.33761,3.876965,4.000594
3,4.000961,4.001856,4.999867,4.702083,4.879474,4.798235,4.008501,4.419289,4.499367,4.646629,4.111242,4.783841,4.998131,4.849097,4.397691,4.535902,2.47456,4.596389,3.68046,3.471708,4.05099,3.00393,3.001134,4.345507,4.395478,4.350102,4.019421,4.804271,3.820957,3.211619,4.227711,3.616436,2.609299,4.874179


## **Recommending similar movies based on a movie that the user just watched**

In [109]:
# After transposing, each row of M_t holds the learned features of one movie
M_t = M.T

# Choose a movie to find similar movies to. Let's find movies similar to movie #5:
movie_id = 5

# Get the chosen movie's name and genre
movie_information = movies_df.loc[movie_id]

print("We are finding movies similar to this movie:")
print("Movie title: {}".format(movie_information.title))
print("Genre: {}".format(movie_information.genre))

# Movie ids in the same order as the rows of M_t (the columns of the ratings matrix).
# Looking rows up by id avoids assuming that ids are exactly 1..N with no gaps.
factorized_movie_ids = ratings_df.columns.get_level_values(-1)

# Get the features for the chosen movie that we found via matrix factorization
current_movie_features = M_t[factorized_movie_ids.get_loc(movie_id)]

# Subtract the current movie's features from every other movie's features and take absolute value
absolute_difference = np.abs(M_t - current_movie_features)

# Sum each movie's per-feature differences to get a total 'difference score' for that movie
total_difference = np.sum(absolute_difference, axis=1)

# Create a new column in the movie list with the difference score for each movie.
# The scores are labelled with their movie ids so pandas aligns them by id, not by position.
movies_df['difference_score'] = pd.Series(total_difference, index=factorized_movie_ids)

# Sort the movie list by difference score, from least different to most different
sorted_movie_list = movies_df.sort_values('difference_score')

# Print the 5 most similar movies, explicitly leaving out the chosen movie itself
print("The five most similar movies are:")
print(sorted_movie_list.drop(index=movie_id)[['title', 'difference_score']].head(5))


We are finding movies similar to this movie:
Movie title: The Big City Judge 2
Genre: legal drama
The five most similar movies are:
                             title  difference_score
movie_id                                            
8         Sci-Fi Murder Detectives          3.083391
11               Inspector Jackson          3.935104
24            The Big City Judge 3          3.960842
9                      Biker Gangs          4.534090
26               Mafia Underground          4.535437


## **Recommending Movies to a User**

In [110]:
user_id_to_search = 5

print("Movies previously reviewed by user_id {}:".format(user_id_to_search))

# Collect this user's ratings and attach the movie metadata to each one
reviewed_movies_df = movie_ratings_df[movie_ratings_df['user_id'] == user_id_to_search]
reviewed_movies_df = reviewed_movies_df.join(movies_df, on='movie_id')

print(reviewed_movies_df[['title', 'genre', 'value']])
print('\n')
print("Movies we will recommend:")

# Find the user's row by id instead of assuming user ids are exactly 1..N with no gaps
user_position = ratings_df.index.get_loc(user_id_to_search)
user_ratings = predicted_ratings[user_position]

# Label each predicted rating with its movie id so pandas aligns by id, not by position
rated_movie_ids = ratings_df.columns.get_level_values(-1)
movies_df['rating'] = pd.Series(user_ratings, index=rated_movie_ids)

# Recommend only movies the user has not reviewed yet, best predicted rating first
already_reviewed = reviewed_movies_df['movie_id']
recommended_df = movies_df[~movies_df.index.isin(already_reviewed)]
recommended_df = recommended_df.sort_values(by=['rating'], ascending=False)

print(recommended_df[['title', 'genre', 'rating']].head(5))

Movies previously reviewed by user_id 5:
                        title                      genre  value
25      My Complicated Family               comedy-drama      3
26          Behind the Scenes               comedy-drama      2
27              The Sheriff 1       crime drama, western      5
28      The Serious Detective            detective drama      5
29  Fake News about Fake News             satire, comedy      5
30               Sports Nerds                     comedy      5
31         Post-Apocalyptia 1  sci-fi, thriller, mystery      3
32                Biker Gangs        crime drama, action      5
33            Political Gaffs   comedy, political satire      4


Movies we will recommend:
                            title                          genre    rating
movie_id                                                                  
10        Surrounded by Zombies 1         horror, zombie fiction  5.254022
14                 The Spy Family                      spy drama  

## **Measuring Accuracy**

In [111]:
# Load the training and testing splits of the user ratings
raw_training_dataset_df = pd.read_csv('movie_ratings_data_set_training.csv')
raw_testing_dataset_df = pd.read_csv('movie_ratings_data_set_testing.csv')

# Convert the running list of user ratings into a matrix, keeping the highest rating
# when a user rated the same movie more than once.
# The aggregation is named by string ('max') because recent pandas versions warn
# when a NumPy callable such as np.max is passed as aggfunc.
ratings_training_df = pd.pivot_table(raw_training_dataset_df, index='user_id', columns='movie_id', aggfunc='max')
ratings_testing_df = pd.pivot_table(raw_testing_dataset_df, index='user_id', columns='movie_id', aggfunc='max')

# Give the test matrix exactly the users and movies of the training matrix so it lines up
# element-for-element with the predictions; anything absent from the test split becomes
# NaN, which RMSE ignores.
ratings_testing_df = ratings_testing_df.reindex(index=ratings_training_df.index,
                                                columns=ratings_training_df.columns)

# Apply matrix factorization to find the latent features.
# Note: the ratings are passed in un-normalized here, so no means are added back below.
U, M = low_rank_matrix_factorization(ratings_training_df.values,num_features=15,reg_amt=1.1)

# Find all predicted ratings by multiplying U and M
predicted_ratings = np.matmul(U, M)

# Measure RMSE on the known entries of each split
rmse_training = RMSE(ratings_training_df.values, predicted_ratings)
rmse_testing = RMSE(ratings_testing_df.values, predicted_ratings)

print("Training RMSE: {}".format(rmse_training))
print("Testing RMSE: {}".format(rmse_testing))


Optimization terminated successfully.
         Current function value: 315.538580
         Iterations: 796
         Function evaluations: 1195
         Gradient evaluations: 1195
Training RMSE: 0.2495249606551651
Testing RMSE: 1.2096469556105187
