# <font color=hotpink> Review Materials </font> 

## <font color=gold> Recommendation systems </font>

## Make pivot table from dataframe

## Show data in browser

In [1]:
import pandas as pd
import numpy as np
import os
import webbrowser

# Read the dataset into a data table using Pandas
df = pd.read_csv("movie_ratings_data_set.csv")

# Convert the running list of user ratings into a matrix using the 'pivot table' function
ratings_df = pd.pivot_table(df,index= 'user_id', columns='movie_id')

# Create a csv file of the data for easy viewing
ratings_df.to_csv("review_matrix.csv", na_rep="")

# Create a web page view of the data for easy viewing
html = ratings_df.to_html(na_rep="")

# Save the html to a temporary file
with open("review_matrix.html", "w") as f:
    f.write(html)

# Open the web page in our web browser
full_filename = os.path.abspath("review_matrix.html")
webbrowser.open("file://{}".format(full_filename))

True

# Create Review Matrix

In [2]:
# Read the dataset into a data table using Pandas
data_table = pd.read_csv("movies.csv", index_col="movie_id")

# Create a web page view of the data for easy viewing
html = data_table.to_html()

# Save the html to a temporary file
with open("movie_list.html", "w") as f:
    f.write(html)

# Open the web page in our web browser
full_filename = os.path.abspath("movie_list.html")
webbrowser.open("file://{}".format(full_filename))

True

# Matrix Factorization Utilities

In [3]:
from scipy.optimize import fmin_cg


def normalize_ratings(ratings):
    """
    Given an array of user ratings, subtract the mean of each product's ratings
    :param ratings: 2d array of user ratings
    :return: (normalized ratings array, the calculated means)
    """
    mean_ratings = np.nanmean(ratings, axis=0)
    return ratings - mean_ratings, mean_ratings


def cost(X, *args):
    """
    Cost function for low rank matrix factorization
    :param X: The matrices being factored (P and Q) rolled up as a contiguous array
    :param args: Array containing (num_users, num_products, num_features, 
                 
                 ratings, mask, regularization_amount)
    :return: The cost with the current P and Q matrices
    """
    num_users, num_products, num_features, ratings, mask, regularization_amount = args

    # Unroll P and Q
    P = X[0:(num_users * num_features)].reshape(num_users, num_features)
    Q = X[(num_users * num_features):].reshape(num_products, num_features)
    Q = Q.T

    # Calculate current cost
    return (np.sum(np.square(mask * (np.dot(P, Q) - ratings))) / 2) 
    + ((regularization_amount / 2.0) * np.sum(np.square(Q.T)))
    + ((regularization_amount / 2.0) * np.sum(np.square(P)))


def gradient(X, *args):
    """
    Calculate the cost gradients with the current P and Q.
    :param X: The matrices being factored (P and Q) rolled up as a contiguous array
    :param args: Array containing (num_users, num_products, num_features, 
                 ratings, mask, regularization_amount)
    :return: The gradient with the current X
    """
    num_users, num_products, num_features, ratings, mask, regularization_amount = args

    # Unroll P and Q
    P = X[0:(num_users * num_features)].reshape(num_users, num_features)
    Q = X[(num_users * num_features):].reshape(num_products, num_features)
    Q = Q.T

    # Calculate the current gradients for both P and Q
    P_grad = np.dot((mask * (np.dot(P, Q) - ratings)), Q.T) + (regularization_amount * P)
    Q_grad = np.dot((mask * (np.dot(P, Q) - ratings)).T, P) + (regularization_amount * Q.T)

    # Return the gradients as one rolled-up array as expected by fmin_cg
    return np.append(P_grad.ravel(), Q_grad.ravel())


def low_rank_matrix_factorization(ratings, mask=None, num_features=15, regularization_amount=0.01):
    """
    Factor a ratings array into two latent feature arrays (user features and product features)

    :param ratings: Matrix with user ratings to factor
    :param mask: A binary mask of which ratings are present in the ratings array to factor
    :param num_features: Number of latent features to generate for users and products
    :param regularization_amount: How much regularization to apply
    :return: (P, Q) - the factored latent feature arrays
    """
    num_users, num_products = ratings.shape

    # If no mask is provided, consider all 'NaN' elements as missing and create a mask.
    if mask is None:
        mask = np.invert(np.isnan(ratings))

    # Replace NaN values with zero
    ratings = np.nan_to_num(ratings)

    # Create P and Q and fill with random numbers to start
    np.random.seed(0)
    P = np.random.randn(num_users, num_features)
    Q = np.random.randn(num_products, num_features)

    # Roll up P and Q into a contiguous array as fmin_cg expects
    initial = np.append(P.ravel(), Q.ravel())

    # Create an args array as fmin_cg expects
    args = (num_users, num_products, num_features, ratings, mask, regularization_amount)

    # Call fmin_cg to minimize the cost function and this find the best values for P and Q
    X = fmin_cg(cost, initial, fprime=gradient, args=args, maxiter=3000)

    # Unroll the new P and new Q arrays out of the contiguous array returned by fmin_cg
    nP = X[0:(num_users * num_features)].reshape(num_users, num_features)
    nQ = X[(num_users * num_features):].reshape(num_products, num_features)

    return nP, nQ.T


def RMSE(real, predicted):
    """
    Calculate the root mean squared error between a matrix of real ratings and predicted ratings
    :param real: A matrix containing the real ratings (with 'NaN' for any missing elements)
    :param predicted: A matrix of predictions
    :return: The RMSE as a float
    """
    return np.sqrt(np.nanmean(np.square(real - predicted)))

# Find Similar Products

In [4]:
# Load user ratings
df = pd.read_csv('movie_ratings_data_set.csv')

# Load movie titles
movies_df = pd.read_csv('movies.csv', index_col='movie_id')

# Convert the running list of user ratings into a matrix
ratings_df = pd.pivot_table(df, index='user_id', columns='movie_id', aggfunc=np.max)

# Apply matrix factorization to find the latent features
U, M = low_rank_matrix_factorization(ratings_df.as_matrix(),
                                     num_features=15, regularization_amount=1.0)

# Swap the rows and columns of product_features just so it's easier to work with
M = np.transpose(M)

# Choose a movie to find similar movies to. Let's find movies similar to movie #5:
movie_id = 5

# Get movie #1's name and genre
movie_information = movies_df.loc[movie_id]

print("We are finding movies similar to this movie:")
print("Movie title: {}".format(movie_information.title))
print("Genre: {}".format(movie_information.genre))

# Get the features for movie #1 we found via matrix factorization
current_movie_features = M[movie_id - 1]

print("The attributes for this movie are:")
print(current_movie_features)

# The main logic for finding similar movies:

# 1. Subtract the current movie's features from every other movie's features
difference = M - current_movie_features

# 2. Take the absolute value of that difference (so all numbers are positive)
absolute_difference = np.abs(difference)

# 3. Each movie has 15 features. Sum those 15 features to get a total 'difference score' for each movie
total_difference = np.sum(absolute_difference, axis=1)

# 4. Create a new column in the movie list with the difference score for each movie
movies_df['difference_score'] = total_difference

# 5. Sort the movie list by difference score, from least different to most different
sorted_movie_list = movies_df.sort_values('difference_score')

# 6. Print the result, showing the 5 most similar movies to movie_id #1
print("The five most similar movies are:")
print(sorted_movie_list[['title', 'difference_score']][0:5])

         Current function value: 98.010501
         Iterations: 6
         Function evaluations: 52
         Gradient evaluations: 40
We are finding movies similar to this movie:
Movie title: The Big City Judge 2
Genre: legal drama
The attributes for this movie are:
[ 0.88432515 -0.89748365 -1.33937495  0.20892199  0.1807583  -1.40433794
 -0.34252951  0.29970638  0.19030158 -0.443964   -0.96892061 -0.72377655
 -0.73211201 -1.20936379  0.99105328]
The five most similar movies are:
                               title  difference_score
movie_id                                              
5               The Big City Judge 2          0.000000
13                     The Sheriff 3          7.476749
15        We Will Fight Those Aliens          7.962674
3                      The Sheriff 2          8.038197
24              The Big City Judge 3          8.562607


  # This is added back by InteractiveShellApp.init_path()


# Factor Review Matrix

In [5]:
# Apply matrix factorization to find the latent features
U, M = low_rank_matrix_factorization(ratings_df.as_matrix(),
                                     num_features=15, regularization_amount=0.1)

# Find all predicted ratings by multiplying the U by M
predicted_ratings = np.matmul(U, M)

# Save all the ratings to a csv file
predicted_ratings_df = pd.DataFrame(index=ratings_df.index,
                                    columns=ratings_df.columns,
                                    data=predicted_ratings)
predicted_ratings_df.to_csv("predicted_ratings.csv")

         Current function value: 0.443573
         Iterations: 26
         Function evaluations: 102
         Gradient evaluations: 90


  


# Making Recommendations

In [6]:
# Load user ratings
raw_dataset_df = pd.read_csv('movie_ratings_data_set.csv')

# Load movie titles
movies_df = pd.read_csv('movies.csv', index_col='movie_id')

# Convert the running list of user ratings into a matrix
ratings_df = pd.pivot_table(raw_dataset_df, index='user_id',
                            columns='movie_id',
                            aggfunc=np.max)

# Apply matrix factorization to find the latent features
U, M = low_rank_matrix_factorization(ratings_df.as_matrix(),
                                                                    num_features=15,
                                                                    regularization_amount=0.1)

# Find all predicted ratings by multiplying U and M matrices
predicted_ratings = np.matmul(U, M)

print("Enter a user_id to get recommendations (Between 1 and 100):")
user_id_to_search = int(input())

print("Movies previously reviewed by user_id {}:".format(user_id_to_search))

reviewed_movies_df = raw_dataset_df[raw_dataset_df['user_id'] == user_id_to_search]
reviewed_movies_df = reviewed_movies_df.join(movies_df, on='movie_id')

print(reviewed_movies_df[['title', 'genre', 'value']])

input("Press enter to continue.")

print("Movies we will recommend:")

user_ratings = predicted_ratings[user_id_to_search - 1]
movies_df['rating'] = user_ratings

already_reviewed = reviewed_movies_df['movie_id']
recommended_df = movies_df[movies_df.index.isin(already_reviewed) == False]
recommended_df = recommended_df.sort_values(by=['rating'], ascending=False)

print(recommended_df[['title', 'genre', 'rating']].head(5))

  del sys.path[0]


         Current function value: 0.443573
         Iterations: 26
         Function evaluations: 102
         Gradient evaluations: 90
Enter a user_id to get recommendations (Between 1 and 100):
1
Movies previously reviewed by user_id 1:
               title                  genre  value
0      The Sheriff 4   crime drama, western      4
1  Mafia Underground  crime drama, thriller      4
2        Biker Gangs    crime drama, action      4
3      The Sheriff 1   crime drama, western      4
4     The Spy Family              spy drama      4
5      The Sheriff 3   crime drama, western      5
Press enter to continue.
Movies we will recommend:
                               title                     genre    rating
movie_id                                                                
24              The Big City Judge 3               legal drama  8.161112
5               The Big City Judge 2               legal drama  5.637546
15        We Will Fight Those Aliens            sci-fi, action

# Measuring Accuracy

In [7]:
# Load user ratings
raw_training_dataset_df = pd.read_csv('movie_ratings_data_set_training.csv')
raw_testing_dataset_df = pd.read_csv('movie_ratings_data_set_testing.csv')

# Convert the running list of user ratings into a matrix
ratings_training_df = pd.pivot_table(raw_training_dataset_df, index='user_id', 
                                     columns='movie_id', aggfunc=np.max)
ratings_testing_df = pd.pivot_table(raw_testing_dataset_df, index='user_id', 
                                    columns='movie_id', aggfunc=np.max)

# Apply matrix factorization to find the latent features
U, M = low_rank_matrix_factorization(ratings_training_df.as_matrix(),
                                    num_features=11, regularization_amount=1.1)

# Find all predicted ratings by multiplying U and M
predicted_ratings = np.matmul(U, M)

# Measure RMSE
rmse_training = RMSE(ratings_training_df.as_matrix(), predicted_ratings)
rmse_testing = RMSE(ratings_testing_df.as_matrix(), predicted_ratings)

print("Training RMSE: {}".format(rmse_training))
print("Testing RMSE: {}".format(rmse_testing))

         Current function value: 28.277160
         Iterations: 11
         Function evaluations: 64
         Gradient evaluations: 53
Training RMSE: 0.3450532248275474
Testing RMSE: 2.96547591459201


  if sys.path[0] == '':


# Training Recommender with Cold Start 

In [8]:
import pickle
# Load user ratings
raw_dataset_df = pd.read_csv('movie_ratings_data_set.csv')

# Convert the running list of user ratings into a matrix
ratings_df = pd.pivot_table(raw_dataset_df, index='user_id', columns='movie_id', aggfunc=np.max)

# Normalize the ratings (center them around their mean)
normalized_ratings, means = normalize_ratings(ratings_df.as_matrix())

# Apply matrix factorization to find the latent features
U, M = low_rank_matrix_factorization(normalized_ratings, num_features=11, regularization_amount=1.1)

# Find all predicted ratings by multiplying U and M
predicted_ratings = np.matmul(U, M)

# Add back in the mean ratings for each product to de-normalize the predicted results
predicted_ratings = predicted_ratings + means

# Save features and predicted ratings to files for later use
pickle.dump(U, open("user_features.dat", "wb"))
pickle.dump(M, open("product_features.dat", "wb"))
pickle.dump(predicted_ratings, open("predicted_ratings.dat", "wb" ))
pickle.dump(means, open("means.dat", "wb" ))

         Current function value: 17.952392
         Iterations: 14
         Function evaluations: 79
         Gradient evaluations: 69


  if __name__ == '__main__':


# Making Recommendation from Cold Start

In [9]:
# Load prediction rules from data files
means = pickle.load(open("means.dat", "rb"))

# Load movie titles
movies_df = pd.read_csv('movies.csv', index_col='movie_id')

# Just use the average movie ratings directly as the user's predicted ratings
user_ratings = means

print("Movies we will recommend:")

movies_df['rating'] = user_ratings
movies_df = movies_df.sort_values(by=['rating'], ascending=False)

print(movies_df[['title', 'genre', 'rating']].head(5))

Movies we will recommend:
                            title                   genre    rating
movie_id                                                           
6               Attack on Earth 1          sci-fi, action  4.900000
10        Surrounded by Zombies 1  horror, zombie fiction  4.882353
3                   The Sheriff 2    crime drama, western  4.818182
12                     Horrorfest                  horror  4.800000
5            The Big City Judge 2             legal drama  4.785714


# Training Recommender

In [10]:
# Load user ratings
raw_dataset_df = pd.read_csv('movie_ratings_data_set.csv')

# Convert the running list of user ratings into a matrix
ratings_df = pd.pivot_table(raw_dataset_df, index='user_id', columns='movie_id', aggfunc=np.max)

# Apply matrix factorization to find the latent features
U, M = low_rank_matrix_factorization(ratings_df.as_matrix(), num_features=15, regularization_amount=0.1)

# Find all predicted ratings by multiplying U and M
predicted_ratings = np.matmul(U, M)

# Save features and predicted ratings to files for later use
pickle.dump(U, open("user_features.dat", "wb"))
pickle.dump(M, open("product_features.dat", "wb"))
pickle.dump(predicted_ratings, open("predicted_ratings.dat", "wb" ))

  


         Current function value: 0.443573
         Iterations: 26
         Function evaluations: 102
         Gradient evaluations: 90


# Making Recommendation

In [11]:
import pickle
import pandas as pd

# Load prediction rules from data files
U = pickle.load(open("user_features.dat", "rb"))
M = pickle.load(open("product_features.dat", "rb"))
predicted_ratings = pickle.load(open("predicted_ratings.dat", "rb"))

# Load movie titles
movies_df = pd.read_csv('movies.csv', index_col='movie_id')

print("Enter a user_id to get recommendations (Between 1 and 100):")
user_id_to_search = int(input())

print("Movies we will recommend:")

user_ratings = predicted_ratings[user_id_to_search - 1]
movies_df['rating'] = user_ratings
movies_df = movies_df.sort_values(by=['rating'], ascending=False)

print(movies_df[['title', 'genre', 'rating']].head(5))

Enter a user_id to get recommendations (Between 1 and 100):
1
Movies we will recommend:
                               title                 genre    rating
movie_id                                                            
24              The Big City Judge 3           legal drama  8.161112
5               The Big City Judge 2           legal drama  5.637546
15        We Will Fight Those Aliens        sci-fi, action  5.274087
13                     The Sheriff 3  crime drama, western  4.968868
2               The Big City Judge 1           legal drama  4.528773


# Finding product similarity

In [12]:
import pickle
import pandas as pd
import numpy as np

# Load prediction rules from data files
M = pickle.load(open("product_features.dat", "rb"))

# Swap the rows and columns of product_features just so it's easier to work with
M = np.transpose(M)

# Load movie titles
movies_df = pd.read_csv('movies.csv', index_col='movie_id')

# Choose a movie to find similar movies to. Let's find movies similar to movie #5:
movie_id = 5

# Get movie #1's name and genre
movie_information = movies_df.loc[movie_id]

print("We are finding movies similar to this movie:")
print("Movie title: {}".format(movie_information.title))
print("Genre: {}".format(movie_information.genre))

# Get the features for movie #1 we found via matrix factorization
current_movie_features = M[movie_id - 1]

print("The attributes for this movie are:")
print(current_movie_features)

# The main logic for finding similar movies:

# 1. Subtract the current movie's features from every other movie's features
difference = M - current_movie_features

# 2. Take the absolute value of that difference (so all numbers are positive)
absolute_difference = np.abs(difference)

# 3. Each movie has several features. Sum those features to get a total 'difference score' for each movie
total_difference = np.sum(absolute_difference, axis=1)

# 4. Create a new column in the movie list with the difference score for each movie
movies_df['difference_score'] = total_difference

# 5. Sort the movie list by difference score, from least different to most different
sorted_movie_list = movies_df.sort_values('difference_score')

# 6. Print the result, showing the 5 most similar movies to movie_id #1
print("The five most similar movies are:")
print(sorted_movie_list[['title', 'difference_score']][0:5])


We are finding movies similar to this movie:
Movie title: The Big City Judge 2
Genre: legal drama
The attributes for this movie are:
[ 0.87011509 -1.0351991  -1.30446262  0.10657639  0.07166534 -1.42680369
 -0.28336831  0.40711148  0.08468894 -0.44296559 -1.13236308 -0.67210737
 -0.80658773 -1.17878696  1.00120923]
The five most similar movies are:
                               title  difference_score
movie_id                                              
5               The Big City Judge 2          0.000000
13                     The Sheriff 3          7.309342
15        We Will Fight Those Aliens          7.653577
24              The Big City Judge 3          8.076881
2               The Big City Judge 1          8.652308


# Extra Notes

***
***.as_matrix():** make sure that the data is passed as a numpy matrix

***
Save features and predicted ratings to files for later use
<br><br>This helps to save large matrix U faster.
<br><br><font color=green>pickle.dump(U, open("user_features.dat", "wb"))

***

#### <font color=green>import pickle 
import pandas as pd </font>
<br>
Load prediction rules from data files
<br>
<font color=green>means = pickle.load(open("means.dat", "rb"))</font>
<br>
The pickle feature is useful in running the recommendation on pre-saved model data .

***
**MovieLens DataSet**

### Main Idea:
First, convert the data into meaningful table using pd.read_csv and pd.pivot_table. 
<br> Then instead of finding features of each movie by yourself, do low_rank_matrix_factorization to automatically come up with features of movies. In other words, data = U * M. 
<br> Factor a ratings array into two latent feature arrays (user features and product features) as in the following:

In [13]:
def low_rank_matrix_factorization(ratings, mask=None, num_features=15, regularization_amount=0.01):
    """
    Factor a ratings array into two latent feature arrays (user features and product features)

    :param ratings: Matrix with user ratings to factor
    :param mask: A binary mask of which ratings are present in the ratings array to factor
    :param num_features: Number of latent features to generate for users and products
    :param regularization_amount: How much regularization to apply
    :return: (P, Q) - the factored latent feature arrays
    """

This can be done by calculating U and M repeatedly using

In [14]:
from scipy.optimize import fmin_cg

which minimizes the error using gradient calculation. 

Then:
<br>
predicted_ratings = np.matmul(U, M)

Then, for a certain user_ID, one can sort the predicted_ratings and suggest the top five rating for the user. 

***
If a user doesn't have reviews so far, we can either:
<br>
Not suggest anything, or suggest the top rated movies, or suggest related movies.

Related movies are the ones with the most similar M matrix. Also, don't forget regularization.