In [None]:
# Turn off Jedi in IPython's tab completer.
%config Completer.use_jedi = False

# Read data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the movie and rating tables from CSV files in the working directory.
movies = pd.read_csv('./movies.csv')
ratings = pd.read_csv('./ratings.csv')

# Report row and column counts with the same "samples / features" wording for both
# tables (the ratings line previously labelled its row count as "shape").
print(f"movies samples: {movies.shape[0]}, features: {movies.shape[1]}")
print(f"ratings samples: {ratings.shape[0]}, features: {ratings.shape[1]}")

In [None]:
# Preview the first three rows of the movies table.
movies.head(3)

In [None]:
# Preview the first three rows of the ratings table.
ratings.head(3)

# Pivot ratings -> userId - movieId

In [None]:
# Drop every column except the user id, the movie id and the rating value.
ratings = ratings[['userId', 'movieId', 'rating']]
# One row per user, one column per movie id; each cell holds the rating (NaN when absent).
ratings_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating')
ratings_matrix.head(5)

In [None]:
# Join the movie table onto each rating so that titles can label the columns.
ratings_movies = ratings.merge(movies, on='movieId')

# One row per user, one column per title; missing ratings are stored as 0 instead of NaN.
ratings_matrix = ratings_movies.pivot_table(
    index='userId', columns='title', values='rating'
).fillna(0)
ratings_matrix.head(5)

# Cosine similarity matrix

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Flip users x titles into titles x users so that every row describes one movie.
ratings_matrix_T = ratings_matrix.T

# Pairwise cosine similarity between all title rows (the matrix is compared with itself).
item_sim = cosine_similarity(ratings_matrix_T)

# Label both axes of the square similarity array with the movie titles.
item_sim_df = pd.DataFrame(item_sim, index=ratings_matrix_T.index, columns=ratings_matrix_T.index)

In [None]:
# Display the title-by-title cosine similarity table.
item_sim_df

## Personalized recommendation system: item-based k-nearest-neighbor collaborative filtering

- $\hat{R}_{u, i}$: 영화 $i$에 대한 사용자 $u$의 평점 예측값
- $S_{i, N}$: 영화 $i$와 가장 유사도가 높은 top-$N$ 개 영화의 유사도 벡터
- $R_{u, N}$: 사용자 $u$의 영화 $i$와 가장 유사도가 높은 top-$N$개 영화에 대한 실제 평점 벡터


$$\hat{R}_{u, i} = \frac{\sum^{N} \left( S_{i, N} \cdot R_{u, N} \right)}{\sum^{N} \left| S_{i, N} \right|}$$

In [None]:
def predict_rating(ratings_arr, item_sim_arr):
    """Return predicted ratings for every user and every movie.

    Each prediction is the similarity-weighted sum of the user's ratings divided by
    the sum of the absolute similarities of the target movie.

    ratings_arr: numpy array, rows are users and columns are movies, values are ratings,
    item_sim_arr: numpy array, rows and columns are movies (same order as the columns
        of ratings_arr), values are cosine similarities

    Returns a numpy array of predicted ratings with one column per movie."""
    # Normalise by sum(|S|), not |sum(S)|: the two differ as soon as a column holds
    # similarities of mixed sign, where |sum(S)| can shrink towards zero.
    # keepdims keeps the (1, n_movies) row shape used for broadcasting.
    sim_norm = np.abs(item_sim_arr).sum(axis=0, keepdims=True)
    ratings_pred = ratings_arr.dot(item_sim_arr) / sim_norm
    return ratings_pred

In [None]:
# Score every (user, title) pair with the full similarity matrix, then restore the labels.
ratings_pred = predict_rating(ratings_arr=ratings_matrix.values, item_sim_arr=item_sim_df.values)
ratings_pred_matrix = pd.DataFrame(data=ratings_pred, columns=ratings_matrix.columns, index=ratings_matrix.index)
ratings_pred_matrix.head(3)

# Evaluate

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
def get_mse(pred, actual):
    """Return the mean squared error over the positions where `actual` is non-zero.

    Positions where `actual` is 0 are left out of the comparison.

    pred: numpy array, predicted values,
    actual: numpy array, ground truth values (same shape as pred)"""
    # Build the index of non-zero ground-truth entries once and reuse it for both arrays;
    # indexing with it already yields 1-D arrays.
    rated = actual.nonzero()
    # scikit-learn's signature is (y_true, y_pred), so the ground truth goes first.
    return mean_squared_error(actual[rated], pred[rated])

In [None]:
# Error of the predictions currently held in ratings_pred, measured on rated entries only.
'A mean square error of result: {:0.4f}'.format(get_mse(ratings_pred, ratings_matrix.values))

In [None]:
from tqdm import tqdm

In [None]:
def predict_rating_top_sim(ratings_arr, item_sim_arr, n=20):
    """Return predicted ratings using only the n most similar movies per target movie.

    For each movie, the prediction for a user is the similarity-weighted sum of that
    user's ratings of the n most similar movies, divided by the sum of the absolute
    values of those n similarities.

    ratings_arr: numpy array, rows are users and columns are movies, values are ratings,
    item_sim_arr: numpy array, rows and columns are movies (same order as the columns
        of ratings_arr), values are cosine similarities,
    n: integer, number of most similar movies used for each prediction

    Returns a numpy array with the same shape as ratings_arr."""
    pred = np.zeros(ratings_arr.shape)

    for col in tqdm(range(ratings_arr.shape[1])):
        # Indexes of the n movies with the highest similarity to `col`, best first.
        top_n_items = np.argsort(item_sim_arr[:, col])[:-(n+1):-1]
        # Read the weights from the same column that was ranked, so ranking and
        # weighting stay consistent even if the similarity matrix is not symmetric.
        top_sims = item_sim_arr[top_n_items, col]
        # Score all users at once with one matrix-vector product per movie.
        pred[:, col] = ratings_arr[:, top_n_items].dot(top_sims) / np.abs(top_sims).sum()
    return pred

In [None]:
# Replace the earlier predictions with ones built from the 20 nearest movies only.
ratings_pred = predict_rating_top_sim(ratings_arr=ratings_matrix.values, item_sim_arr=item_sim_df.values, n=20)

In [None]:
# Error of the predictions currently held in ratings_pred, measured on rated entries only.
'A mean square error of result: {:0.4f}'.format(get_mse(ratings_pred, ratings_matrix.values))

In [None]:
# Put the user ids and titles back on the prediction array.
ratings_pred_matrix = pd.DataFrame(ratings_pred, columns=ratings_matrix.columns, index=ratings_matrix.index)

In [None]:
# Row of user 9: one value per title, with 0 where the user gave no rating.
user_rating_id = ratings_matrix.loc[9]
# The ten highest rating values that user 9 actually gave.
user_rating_id[0 < user_rating_id].sort_values(ascending=False)[:10].tolist()

In [None]:
def get_unseen_movies(ratings_matrix, user_id):
    """Return the column labels of the movies a user has not rated.

    A value of 0 in the user's row is treated as "not rated".

    ratings_matrix: pandas DataFrame, rows are users and columns are movies, values are ratings,
    user_id: integer, a user id (a row label of ratings_matrix)"""

    # All movie ratings of the user.
    user_rating = ratings_matrix.loc[user_id, :]

    # Compare with 0 exactly: a `< 1` test would also select genuine ratings that
    # lie between 0 and 1 (for example 0.5) and report them as unseen.
    unseen_list = user_rating[user_rating == 0].index.tolist()

    return unseen_list

In [None]:
def recomm_movie_by_user_id(pred_df, user_id, unseen_list, top_n=10):
    """Return the top_n unseen movies of a user, ordered by predicted rating (highest first)."""
    # Predicted scores of this user, restricted to the unseen movies.
    candidate_scores = pred_df.loc[user_id, unseen_list]
    ranked = candidate_scores.sort_values(ascending=False)
    return ranked[:top_n]

In [None]:
# Titles that user 9 has not rated yet.
unseen_list = get_unseen_movies(ratings_matrix, user_id=9)

# Ten best-scoring unrated titles for user 9, reshaped into a one-column table of scores.
recomm_movies = recomm_movie_by_user_id(ratings_pred_matrix, user_id=9, unseen_list=unseen_list, top_n=10)
recomm_movies = pd.DataFrame(recomm_movies.values, columns=['pred_score'], index=recomm_movies.index)

In [None]:
# Display the recommended titles with their predicted scores.
recomm_movies