In [47]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
movies = pd.read_csv('../data/ml-32m/movies.csv')
reviews = pd.read_csv('../data/ml-32m/ratings.csv')

In [126]:
# Generate the review matrix
movies_reduced = movies[movies['movieId'] <= 10]  # Limit to first 10 movies
reviews_reduced = reviews[(reviews['movieId'] <= 10) & (reviews['userId'] <= 10000)]  # Limit to first 10 movies and 1000 users 

review_matrix = reviews_reduced.pivot(index='userId', columns='movieId', values='rating')
print(review_matrix)

movieId   1    2    3   4    5    6    7   8   9    10
userId                                                
3        NaN  3.5  NaN NaN  NaN  NaN  NaN NaN NaN  4.0
5        NaN  NaN  NaN NaN  NaN  NaN  NaN NaN NaN  4.0
10       2.5  2.0  NaN NaN  NaN  NaN  NaN NaN NaN  4.0
11       3.0  NaN  3.0 NaN  3.0  3.0  NaN NaN NaN  NaN
13       NaN  NaN  NaN NaN  NaN  5.0  NaN NaN NaN  NaN
...      ...  ...  ...  ..  ...  ...  ...  ..  ..  ...
9994     3.0  NaN  NaN NaN  NaN  NaN  NaN NaN NaN  NaN
9995     NaN  NaN  4.0 NaN  NaN  NaN  NaN NaN NaN  NaN
9996     5.0  NaN  NaN NaN  5.0  3.0  NaN NaN NaN  NaN
9997     3.0  NaN  4.0 NaN  5.0  NaN  5.0 NaN NaN  NaN
9998     4.0  NaN  NaN NaN  NaN  NaN  NaN NaN NaN  NaN

[5075 rows x 10 columns]


In [128]:
# Calculate cosine similarity between movies
review_matrix_t = review_matrix.T
movie_similarity = cosine_similarity(review_matrix_t.fillna(0))
movie_similarity_df = pd.DataFrame(movie_similarity, index=review_matrix_t.index, columns=review_matrix_t.index)
movie_similarity_df

# Notes:
# Pros:
# - Captures patterns of ratings accross users. Similar patterns suggest similar audience appeal.
# - Magnitude of ratings doesn't matter, just the pattern. Good because different users have different rating scales.
# - Not good that users who haven't watched the movie influence the similarity
# - 

movieId,1,2,3,4,5,6,7,8,9,10
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1.0,0.384273,0.27074,0.122857,0.253194,0.355148,0.271155,0.083649,0.153061,0.361349
2,0.384273,1.0,0.211616,0.143387,0.219717,0.266967,0.194206,0.157,0.11729,0.368971
3,0.27074,0.211616,1.0,0.184589,0.420742,0.27668,0.373259,0.10703,0.235415,0.210109
4,0.122857,0.143387,0.184589,1.0,0.183479,0.13324,0.184083,0.101061,0.18764,0.12129
5,0.253194,0.219717,0.420742,0.183479,1.0,0.217992,0.390267,0.127311,0.210208,0.187518
6,0.355148,0.266967,0.27668,0.13324,0.217992,1.0,0.251391,0.072457,0.220216,0.3574
7,0.271155,0.194206,0.373259,0.184083,0.390267,0.251391,1.0,0.096657,0.182866,0.193409
8,0.083649,0.157,0.10703,0.101061,0.127311,0.072457,0.096657,1.0,0.103244,0.108149
9,0.153061,0.11729,0.235415,0.18764,0.210208,0.220216,0.182866,0.103244,1.0,0.164262
10,0.361349,0.368971,0.210109,0.12129,0.187518,0.3574,0.193409,0.108149,0.164262,1.0


In [130]:
# Calculate cosine similarity between users
user_similarity = cosine_similarity(review_matrix.fillna(0))
user_similarity_df = pd.DataFrame(user_similarity, index=review_matrix.index, columns=review_matrix.index)
user_similarity_df

userId,3,5,10,11,13,17,19,20,23,24,...,9984,9985,9988,9991,9993,9994,9995,9996,9997,9998
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,1.000000,0.752577,0.844606,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.433628,0.532152,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
5,0.752577,1.000000,0.780720,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.707107,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
10,0.844606,0.780720,1.000000,0.243975,0.000000,0.345033,0.487950,0.487950,0.487950,0.487950,...,0.487950,0.487950,0.624274,0.897085,0.292770,0.487950,0.00000,0.317628,0.169031,0.487950
11,0.000000,0.000000,0.243975,1.000000,0.500000,0.707107,0.500000,0.500000,0.500000,0.500000,...,0.500000,0.500000,0.376288,0.353553,0.700000,0.500000,0.50000,0.846228,0.692820,0.500000
13,0.000000,0.000000,0.000000,0.500000,1.000000,0.707107,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.800000,0.000000,0.00000,0.390567,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,0.000000,0.000000,0.487950,0.500000,0.000000,0.707107,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,0.752577,0.707107,0.600000,1.000000,0.00000,0.650945,0.346410,1.000000
9995,0.000000,0.000000,0.000000,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.00000,0.000000,0.461880,0.000000
9996,0.000000,0.000000,0.317628,0.846228,0.390567,0.736460,0.650945,0.650945,0.650945,0.650945,...,0.650945,0.650945,0.489886,0.460287,0.703020,0.650945,0.00000,1.000000,0.601317,0.650945
9997,0.000000,0.000000,0.169031,0.692820,0.000000,0.244949,0.346410,0.346410,0.346410,0.346410,...,0.346410,0.346410,0.260700,0.244949,0.207846,0.346410,0.46188,0.601317,1.000000,0.346410


In [131]:
# Finding most similar users to a given user
given_user_id = 3
most_similar_users = user_similarity_df[given_user_id].sort_values(ascending=False).head(10)
most_similar_users

userId
3       1.000000
6289    1.000000
8046    0.999969
6293    0.999969
5567    0.999948
1181    0.999948
495     0.999710
9056    0.999028
6318    0.999028
3882    0.997785
Name: 3, dtype: float64

In [None]:
# Calculate rating predictions based on similar users
def predict_ratings_user_based(user_id, movie_id, review_matrix, user_similarity_df, k=10):
    """
    Predict a user's rating for a movie using collaborative filtering with k-nearest neighbors, i.e. by looking at how the k most similar who have watched the movie rated it.
    
    Parameters:
        user_id (int): ID of the target user
        movie_id (int): ID of the movie to predict rating for
        review_matrix (pd.DataFrame): Matrix of user-movie ratings where rows are users, columns are movies
        user_similarity_df (pd.DataFrame): Matrix of user-user similarities
        k (int, optional): Number of similar users to consider. Defaults to 10
    
    Returns:
        float: Predicted rating for the target movie by the target user
        
    Algorithm:
        1. Calculate target user's average rating
        2. Find k most similar users who rated the target movie
        3. Calculate rating offset for similar users (deviation from their mean rating)
        4. Predict rating as user's mean + weighted sum of neighbors' rating offsets
    """

    # calculate average rating of the target user
    user_reviews = review_matrix.loc[user_id]
    average_user_review = user_reviews[user_reviews != 0].mean() 

    # find k most similar users who have rated the target movie
    knn = user_similarity_df[user_id].sort_values(ascending=False).drop(user_id) # most similar users excluding self
    knn = knn[review_matrix[movie_id] != 0]  # only users who have rated the target movie
    knn = knn.head(k)

    # get the rating offsets of these similar users for the target movie (their rating minus their average rating)
    all_neighbor_reviews = review_matrix.loc[knn.index] # all reviews made by neighbors
    movie_neighbor_reviews = all_neighbor_reviews[movie_id] # reviews for the target movie by neighbors
    neighbor_review_offset = movie_neighbor_reviews - all_neighbor_reviews[all_neighbor_reviews != 0].mean(axis=1)

    # calculate predicted rating
    nominator = np.sum(knn * neighbor_review_offset) # sum of similar user rating offsets weighted by similarity
    denominator = np.sum(knn) # sum of similar user similarities
    return average_user_review + (nominator / denominator)


predicted_rating = predict_ratings(user_id=5, movie_id=10, review_matrix=review_matrix, user_similarity_df=user_similarity_df, k=10)
predicted_rating

movieId   1    2    3   4    5    6    7   8   9    10
userId                                                
3        NaN  3.5  NaN NaN  NaN  NaN  NaN NaN NaN  4.0
5        NaN  NaN  NaN NaN  NaN  NaN  NaN NaN NaN  4.0
10       2.5  2.0  NaN NaN  NaN  NaN  NaN NaN NaN  4.0
11       3.0  NaN  3.0 NaN  3.0  3.0  NaN NaN NaN  NaN
13       NaN  NaN  NaN NaN  NaN  5.0  NaN NaN NaN  NaN
...      ...  ...  ...  ..  ...  ...  ...  ..  ..  ...
9994     3.0  NaN  NaN NaN  NaN  NaN  NaN NaN NaN  NaN
9995     NaN  NaN  4.0 NaN  NaN  NaN  NaN NaN NaN  NaN
9996     5.0  NaN  NaN NaN  5.0  3.0  NaN NaN NaN  NaN
9997     3.0  NaN  4.0 NaN  5.0  NaN  5.0 NaN NaN  NaN
9998     4.0  NaN  NaN NaN  NaN  NaN  NaN NaN NaN  NaN

[5075 rows x 10 columns]


np.float64(4.0)

In [123]:


def predict_rating_user_based(user_id, item_id, ratings_matrix, similarity_matrix, k=5):
    """
    Predicts the rating a user would give to an item using user-based collaborative filtering (mean-centered).

    Parameters
    ----------
    user_id : int or str
        Target user for the prediction.
    item_id : int or str
        Target item for which we want to predict the rating.
    ratings_matrix : pd.DataFrame
        A user–item matrix (rows = users, columns = items, entries = ratings, NaN = unrated).
    similarity_matrix : pd.DataFrame
        A user–user similarity matrix with the same row/column labels as ratings_matrix rows.
    k : int
        Number of most similar users to consider (top-k neighbors).

    Returns
    -------
    float
        Predicted rating for (user_id, item_id). Returns np.nan if insufficient data.
    """

    # 1. Find users who rated the target item
    item_ratings = ratings_matrix[item_id]
    users_who_rated = item_ratings[item_ratings.notna()].index

    # 2. Get similarities between the target user and those users
    sims = similarity_matrix.loc[user_id, users_who_rated]

    # 3. Select top-k most similar users
    top_k_users = sims.nlargest(k).index
    top_k_sims = sims.loc[top_k_users]
    top_k_ratings = item_ratings.loc[top_k_users]

    # 4. Compute mean-centered ratings
    user_means = ratings_matrix.loc[top_k_users].mean(axis=1)

    # Deviations from each neighbor’s mean
    rating_devs = top_k_ratings - user_means

    # 5. Compute the weighted average of deviations
    if np.sum(np.abs(top_k_sims)) == 0:
        return np.nan

    numerator = np.sum(top_k_sims * rating_devs)
    denominator = np.sum(np.abs(top_k_sims))

    # Add the target user's mean rating back
    user_mean = ratings_matrix.loc[user_id].mean()
    prediction = user_mean + numerator / denominator

    return prediction

predicted_rating = predict_rating_user_based(user_id=5, item_id=10, ratings_matrix=review_matrix.replace(0, np.nan), similarity_matrix=user_similarity_df, k=10)
predicted_rating

np.float64(4.0)

In [124]:

def predict_rating_item_based(user_id, item_id, ratings_matrix, similarity_matrix, k=5):
    """
    Predicts the rating a user would give to an item using item-based collaborative filtering.

    Parameters
    ----------
    user_id : int or str
        The user for whom we want to predict the rating.
    item_id : int or str
        The target item for which we want a predicted rating.
    ratings_matrix : pd.DataFrame
        A user–item matrix where rows = users, columns = items, and entries = ratings.
        Unrated items should be NaN.
    similarity_matrix : pd.DataFrame
        An item–item similarity matrix with the same item labels as ratings_matrix columns.
    k : int
        The number of most similar items to consider (top-k neighbors).

    Returns
    -------
    float
        Predicted rating for (user_id, item_id). Returns np.nan if insufficient data.
    """

    # 1. Get all items rated by the target user
    user_ratings = ratings_matrix.loc[user_id]
    rated_items = user_ratings[user_ratings.notna()].index

    # 2. Get similarities between the target item and all items the user has rated
    sims = similarity_matrix.loc[item_id, rated_items]

    # 3. Select top-k most similar items
    top_k_items = sims.nlargest(k).index
    top_k_sims = sims.loc[top_k_items]
    top_k_ratings = user_ratings.loc[top_k_items]

    # 4. Compute weighted average (mean-centered version)
    # If no similar items, return NaN
    if top_k_sims.sum() == 0:
        return np.nan

    # Optional: mean-centering around user's mean
    user_mean = user_ratings.mean()

    # Weighted sum of deviations
    numerator = np.sum(top_k_sims * (top_k_ratings - user_mean))
    denominator = np.sum(np.abs(top_k_sims))

    prediction = user_mean + numerator / denominator

    return prediction


# Example usage:
predicted_rating_item = predict_rating_item_based(
    user_id=5,
    item_id=10,
    ratings_matrix=review_matrix.replace(0, np.nan),  # Replace 0 with NaN for unrated
    similarity_matrix=movie_similarity_df,
    k=10
)
print(predicted_rating_item)

4.0
