In [None]:
%config Completer.use_jedi = False

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

In [None]:
def get_rmse(R, P, Q, non_zeros):
    """Return the root mean squared error between R and P @ Q.T on the observed cells.

    Only the (row, col) positions listed in non_zeros are compared, so the
    prediction is evaluated for exactly those cells instead of materialising
    the full num_users x num_items product.

    R: original rating matrix, indexed as R[row, col],
    P: user factor matrix, one row of latent factors per row of R,
    Q: item factor matrix, one row of latent factors per column of R,
    non_zeros: iterable of tuples whose first two elements are the (row, col)
        coordinates of an observed rating; any further elements are ignored.

    Returns the RMSE as a NumPy float. NaN values at the listed positions
    propagate to the result.

    Raises ValueError when non_zeros is empty, because the error is undefined
    without at least one observed rating."""

    non_zeros = list(non_zeros)
    if not non_zeros:
        raise ValueError("non_zeros is empty: RMSE is undefined without observed ratings")

    row_idx = [entry[0] for entry in non_zeros]
    col_idx = [entry[1] for entry in non_zeros]

    # Observed ratings at the requested coordinates
    observed = np.asarray(R)[row_idx, col_idx]

    # Row-wise dot products: prediction for cell (i, j) is P[i] . Q[j]
    predicted = np.sum(np.asarray(P)[row_idx] * np.asarray(Q)[col_idx], axis=1)

    return np.sqrt(np.mean((observed - predicted) ** 2))

In [None]:
def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda=0.01):
    """Factorize a rating matrix into P @ Q.T with stochastic gradient descent.

    Only cells of R that are greater than zero are treated as observed
    ratings; zeros and NaN are skipped during training.

    R: user-item rating matrix (rows are users, columns are items),
    K: dimension of the latent factors,
    steps: number of SGD passes over the observed ratings,
    learning_rate: SGD step size,
    r_lambda: L2 regularization strength.

    Returns (P, Q) with shapes (num_users, K) and (num_items, K). P and Q are
    initialised from NumPy's global random state, so seed it beforehand for
    reproducible results. The RMSE on the observed ratings is printed every
    10th step."""

    R = np.asarray(R)
    num_users, num_items = R.shape

    # Initialise both factor matrices with small Gaussian noise
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # Collect the observed ratings as (row, col, rating) triples in row-major
    # order. NaN compares False against 0, so missing ratings drop out here.
    with np.errstate(invalid="ignore"):
        user_idx, item_idx = np.nonzero(R > 0)
    non_zeros = list(zip(user_idx.tolist(), item_idx.tolist(), R[user_idx, item_idx].tolist()))

    # Update P and Q using SGD
    for step in range(steps):
        for i, j, r in non_zeros:
            # Prediction error for this rating
            e_ij = r - np.dot(P[i, :], Q[j, :])
            # Keep the pre-update user factors so both gradient steps are
            # taken at the same point instead of Q seeing the new P[i]
            p_i = P[i, :].copy()
            # Gradient step with L2 regularization
            P[i, :] += learning_rate * (e_ij * Q[j, :] - r_lambda * P[i, :])
            Q[j, :] += learning_rate * (e_ij * p_i - r_lambda * Q[j, :])

        # The RMSE is only reported every 10th step, so only compute it then
        if step % 10 == 0:
            rmse = get_rmse(R, P, Q, non_zeros)
            print("### iteration step: ", step, " rmse: ", rmse)

    return P, Q

In [None]:
# Load the movie metadata and the raw ratings, keeping only the user, movie and score columns
movies = pd.read_csv("./movies.csv")
ratings = pd.read_csv("./ratings.csv")[["userId", "movieId", "rating"]]

# User x movieId matrix of ratings
ratings_matrix = ratings.pivot_table(values="rating", index="userId", columns="movieId")

In [None]:
# Preview the user x movieId rating matrix built in the previous cell
ratings_matrix

In [None]:
# Attach the movie metadata to every rating row, joining on the shared movieId column
rating_movies = ratings.merge(movies, on="movieId")

# Rebuild the user-item matrix with movie titles as the column labels
ratings_matrix = rating_movies.pivot_table(values="rating", index="userId", columns="title")

In [None]:
# Seed NumPy's global RNG: matrix_factorization draws the initial P and Q from it,
# so without a seed every Restart & Run All produces different predictions
np.random.seed(42)
P, Q = matrix_factorization(ratings_matrix.values, K=50, steps=200, learning_rate=0.01, r_lambda=0.01)

# Predicted rating for every (user, movie) pair
pred_matrix = np.dot(P, Q.T)

In [None]:
# Wrap the raw predictions in a DataFrame that reuses the row and column labels of ratings_matrix
ratings_pred_matrix = pd.DataFrame(
    data=pred_matrix,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)
ratings_pred_matrix

In [None]:
def get_unseen_movies(ratings_matrix, user_id):
    """Return the column labels of the movies a user has not rated.

    ratings_matrix: pandas DataFrame, rows are users and columns are movies, values are ratings,
    user_id: row label of the user in ratings_matrix.

    A movie counts as seen only when its rating is greater than 0, so both
    missing (NaN) and zero ratings are reported as unseen. The result keeps
    the column order of ratings_matrix."""

    # All ratings of the requested user
    user_rating = ratings_matrix.loc[user_id, :]

    # Labels of the movies the user has already rated; a set keeps the
    # membership test below O(1) per movie instead of scanning a list
    already_seen = set(user_rating[user_rating > 0].index.tolist())

    # Every movie label, in column order
    movies_list = ratings_matrix.columns.tolist()

    # Keep only the movies without a rating from this user
    unseen_list = [movie for movie in movies_list if movie not in already_seen]

    return unseen_list

In [None]:
# Collect the movies that user 9 has not rated yet
unseen_list = get_unseen_movies(ratings_matrix=ratings_matrix, user_id=9)

In [None]:
def recomm_movie_by_userid(pred_df, user_id, unseen_list, top_n=10):
    """Return the top_n unseen movies for a user, highest predicted rating first.

    pred_df: DataFrame of predicted ratings, rows are users and columns are movies,
    user_id: row label of the user in pred_df,
    unseen_list: column labels of the candidate movies,
    top_n: maximum number of recommendations to return."""
    # Predicted scores of the candidate movies for this user
    candidate_scores = pred_df.loc[user_id, unseen_list]
    # Best predictions first
    ranked_scores = candidate_scores.sort_values(ascending=False)
    return ranked_scores[:top_n]

In [None]:
# Recommend movies with latent-factor collaborative filtering
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 9, unseen_list, top_n=10)

In [None]:
# Turn the recommended scores into a one-column DataFrame for display
recomm_movies = pd.DataFrame(data=recomm_movies.values, index=recomm_movies.index, columns=["pred_score"])
recomm_movies