# Recommendation System

---

In [1]:
# to auto reload any updated py files
%load_ext autoreload
%autoreload 2

1. Download dataset

In [2]:
!wget https://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -q ml-100k.zip

--2025-06-18 01:37:15--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2025-06-18 01:37:16 (4.79 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]



In [3]:
%%writefile data_loader.py

# load ratings & movie data
import pandas as pd


def load_ratings(path="ml-100k/u.data"):
  '''load file with user ratings'''
  return pd.read_csv(path, sep='\t', header=None,
                     names=["user_id", "movie_id", "rating", "timestamp"])

def load_movies(path='ml-100k/u.item'):
  '''load file with movie metadata'''
  return pd.read_csv(path, sep='|', encoding='latin-1', header=None,
                     names=["movie_id", "title", "release_date", "video_release_date",
                            "IMDb_URL"] + [f"genre_{i}" for i in range(19)])

def build_user_item_matrix(ratings_df):
  ''' pivot to user-item matrix with NaNs for missing values'''
  return ratings_df.pivot_table(index='user_id', columns='movie_id', values='rating')

def fill_missing_zero(matrix):
  '''filling NaNs with 0 for cosine similarity'''
  return matrix.fillna(0)

def center_ratings(matrix):
  '''returning mean centered ratings matrix (for pearson similarity)'''
  user_means = matrix.mean(axis=1)
  return matrix.sub(user_means, axis=0), user_means


Writing data_loader.py


In [4]:

#usage reference
from data_loader import load_ratings, load_movies, build_user_item_matrix, fill_missing_zero, center_ratings

ratings = load_ratings()                        #  raw user-movie-rating data
movies = load_movies()                          #  movie metadata (titles, etc.)
user_item = build_user_item_matrix(ratings)     #  sparse matrix [users x movies] of user ratings
user_item_filled = fill_missing_zero(user_item) # fill with 0s (used for cosine similarity)
user_item_centered, user_means = center_ratings(user_item)  # centered matrix (used for Pearson)

item_means = user_item.mean(axis=0)  # item means - avg rating each movie's column, how well movie was rated overall (used for item based Pearson)



In [5]:
# creating similarity.py

%%writefile similarity.py

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(matrix):
  ''' Computing the cosine similarity between users based on rating vectors.
      The input is a matrix with users as rows, movies as columns and no NaNs (filled with zeroes)'''

  similarity = cosine_similarity(matrix.values)
  return pd.DataFrame(similarity, index=matrix.index, columns=matrix.index)

def compute_pearson_similarity(centered_matrix):
    '''Computes the Pearson correlation between users on mean centered data..
       The input is a matrix with mean-centered ratings (NaNs allowed)'''
    return centered_matrix.T.corr(method='pearson')


def get_top_k_neighbors(similarity_matrix, user_id, k=5):
  ''' Get top k most similar users to a given user, excluding themselves.'''
  user_similarities = similarity_matrix.loc[user_id]
  top_k = user_similarities.drop(index=user_id).nlargest(k)
  return top_k


Writing similarity.py


In [6]:
# usage reference

from similarity import compute_cosine_similarity, compute_pearson_similarity, get_top_k_neighbors

# user cosine similarity
user_similarity_cosine = compute_cosine_similarity(user_item_filled)

# user pearson similarity
user_similarity_pearson = compute_pearson_similarity(user_item_centered)

# using pearson to get similar users to a certain other user
top_users = get_top_k_neighbors(user_similarity_pearson, user_id=1, k=5)

In [35]:
# creating predictor.py
%%writefile predictor.py

import numpy as np

def predict_rating_cosine(user_id, movie_id, rating_matrix, similarity_matrix):
  ''' preduct a user's rating for a movie'''
  if movie_id not in rating_matrix.columns:
    return np.nan

  movie_ratings = rating_matrix[movie_id]
  rated_users = movie_ratings[movie_ratings > 0].index

  if user_id not in similarity_matrix.index:
    return np.nan


  similarities = similarity_matrix.loc[user_id, rated_users]
  ratings = movie_ratings[rated_users]

  numerator = np.dot(similarities, ratings)
  denominator = np.sum(np.abs(similarities))

  return numerator / denominator if denominator != 0 else np.nan


def predict_rating_pearson(user_id, movie_id, centered_matrix, similarity_matrix, user_means):
  ''' predict user's rating fora  movie using pearson & centered matrix'''
  if movie_id not in centered_matrix.columns or user_id not in similarity_matrix.index:
    return np.nan

  movie_ratings = centered_matrix[movie_id]
  rated_users = movie_ratings[movie_ratings.notna()].index
  similarities = similarity_matrix.loc[user_id, rated_users]
  ratings = centered_matrix.loc[rated_users, movie_id]

  #filter out NaNs
  valid_mask = ratings.notna() & similarities.notna()
  similarities = similarities[valid_mask]
  ratings = ratings[valid_mask]

  #check denominator
  denominator = np.sum(np.abs(similarities))
  if len(similarities) ==0 or denominator == 0:
    return np.nan

  numerator = np.dot(similarities, ratings)
  return user_means.loc[user_id] + (numerator / denominator)


# addig top k neighbor filtering
def get_top_k_similar_users(user_id, similarity_matrix, k=10, min_similarity=0.0):
  ''' returning the top k most similar users to the target user'''
  if user_id not in similarity_matrix.index:
    return []

  similarities = similarity_matrix.loc[user_id].drop(user_id)
  similarities = similarities[similarities >= min_similarity]
  top_k = similarities.sort_values(ascending=False).head(k)
  return top_k.index

#predict rating top k cosine
def predict_rating_top_k_cosine(user_id, movie_id, rating_matrix, similarity_matrix, k=10, min_similarity=0.0):
  ''' predict rating using cosine similarity and top-k neighbors'''
  if movie_id not in rating_matrix.columns or user_id not in similarity_matrix.index:
    return np.nan

  movie_ratings = rating_matrix[movie_id]
  rated_users = movie_ratings[movie_ratings > 0].index

  # finding overlap between rated users and top k similar ones
  top_k_users = get_top_k_similar_users(user_id, similarity_matrix, k, min_similarity)
  neighbors = [u for u in top_k_users if u in rated_users]

  if not neighbors:
    return np.nan

  similarities = similarity_matrix.loc[user_id, neighbors]
  ratings = rating_matrix.loc[neighbors, movie_id]

  numerator = np.dot(similarities, ratings)
  denominator = np.sum(np.abs(similarities))

  return numerator / denominator if denominator != 0 else np.nan


#predict rating top k pearson
def predict_rating_top_k_pearson(user_id, movie_id, centered_matrix, similarity_matrix, user_means, k=10, min_similarity=0.0):
  '''predict using pearson & top k neighbors'''
  if movie_id not in centered_matrix.columns or user_id not in similarity_matrix.index:
    return np.nan

  movie_ratings = centered_matrix[movie_id]
  rated_users = movie_ratings[movie_ratings.notna()].index

  top_k_users = get_top_k_similar_users(user_id, similarity_matrix, k, min_similarity)
  neighbors = [u for u in top_k_users if u in rated_users]

  if not neighbors:
    return np.nan

  similarities = similarity_matrix.loc[user_id, neighbors]
  ratings = centered_matrix.loc[neighbors, movie_id]

  valid_mask = ratings.notna() & similarities.notna()
  similarities = similarities[valid_mask]
  ratings = ratings[valid_mask]

  denominator = np.sum(np.abs(similarities))
  if len(similarities) == 0 or denominator == 0:
    return np.nan

  numerator = np.dot(similarities,ratings)
  return user_means.loc[user_id] + (numerator / denominator)


# predict rating item-item CF cosine
def predict_rating_item_cosine(user_id, movie_id, rating_matrix, item_similarity_matrix):
  '''predict rating for a user/movie using item-item cos similarity'''
  if user_id not in rating_matrix.index or movie_id not in item_similarity_matrix.index:
    return np.nan

  user_ratings = rating_matrix.loc[user_id]
  rated_items = user_ratings[user_ratings > 0].index

  similarities = item_similarity_matrix.loc[movie_id, rated_items]
  ratings = user_ratings[rated_items]

  numerator = np.dot(similarities, ratings)
  denominator = np.sum(np.abs(similarities))

  return numerator / denominator if denominator != 0 else np.nan

# predict rating item CF Pearson
def predict_rating_item_pearson(user_id, movie_id, centered_matrix, item_similarity_matrix, item_means):
  '''predict rating using item-item pearson sim & centered ratings'''
  if user_id not in centered_matrix.index or movie_id not in item_similarity_matrix.index:
    return np.nan

  user_ratings = centered_matrix.loc[user_id]
  rated_items = user_ratings[user_ratings.notna()].index

  similarities = item_similarity_matrix.loc[movie_id, rated_items]
  ratings = centered_matrix.loc[user_id, rated_items]

  valid_mask = ratings.notna() & similarities.notna()
  similarities = similarities[valid_mask]
  ratings = ratings[valid_mask]

  denominator = np.sum(np.abs(similarities))
  if len(similarities) == 0 or denominator == 0:
    return np.nan

  numerator = np.dot(similarities, ratings)
  return item_means.loc[movie_id] + (numerator / denominator)

#item based CF functions
# get top k similar items
def get_top_k_similar_items(movie_id, item_similarity_matrix, k=10, min_similarity=0.0):
  '''return top kmost similar items (movies) to the target movie'''
  if movie_id not in item_similarity_matrix.index:
    return[]

  similarities = item_similarity_matrix.loc[movie_id].drop(movie_id)
  similarities = similarities[similarities>= min_similarity]
  top_k = similarities.sort_values(ascending=False).head(k)

  return top_k.index


def predict_rating_top_k_item_cosine(user_id, movie_id, rating_matrix, item_similarity_matrix, k=10, min_similarity=0.0):
  '''predict rating using item-item cosine similarity & top-k filtering'''
  if user_id not in rating_matrix.index or movie_id not in item_similarity_matrix.index :
    return np.nan

  user_ratings = rating_matrix.loc[user_id]
  rated_items = user_ratings[user_ratings > 0 ].index

  top_k_items= get_top_k_similar_items(movie_id,item_similarity_matrix, k, min_similarity)
  neighbors = [item for item in top_k_items if item in rated_items]

  if not neighbors:
    return np.nan

  similarities = item_similarity_matrix.loc[movie_id, neighbors]
  ratings = user_ratings[neighbors]

  numerator = np.dot(similarities, ratings)
  denominator = np.sum(np.abs(similarities))

  return numerator / denominator if denominator != 0 else np.nan

def predict_rating_top_k_item_pearson(user_id, movie_id, centered_matrix, item_similarity_matrix, item_means, k=10, min_similarity=0.0):
  ''' predict rating using item-item Pearson sim. with top k filtering'''
  if user_id not in centered_matrix.index or movie_id not in item_similarity_matrix.index:
    return np.nan

  user_ratings = centered_matrix.loc[user_id]
  rated_items= user_ratings[user_ratings.notna()].index



  top_k_items= get_top_k_similar_items(movie_id, item_similarity_matrix, k, min_similarity)
  neighbors = [item for item in top_k_items if item in rated_items]

  if not neighbors:
    return np.nan

  similarities = item_similarity_matrix.loc[movie_id, neighbors]
  ratings = user_ratings[neighbors]

  valid_mask = ratings.notna() & similarities.notna()
  similarities = similarities[valid_mask]
  ratings = ratings[valid_mask]

  denominator = np.sum(np.abs(similarities))
  if len(similarities) == 0 or denominator == 0 :
    return np.nan

  numerator = np.dot(similarities, ratings)
  return item_means.loc[movie_id] + (numerator / denominator)

Overwriting predictor.py


In [8]:
# update predictor with top k
from predictor import predict_rating_top_k_cosine, predict_rating_top_k_pearson


topk_cosine_pred = predict_rating_top_k_cosine(
    user_id=1,
    movie_id=50,
    rating_matrix= user_item_filled,
    similarity_matrix= user_similarity_cosine,
    k=30
)

topk_pearson_pred = predict_rating_top_k_pearson(
    user_id=1,
    movie_id=50,
    centered_matrix= user_item_centered,
    similarity_matrix= user_similarity_pearson,
    user_means=user_means,
    k=30
)

print(f"top k Cosine Prediction: {topk_cosine_pred: .2f}")
print(f"top kPearson prediction: {topk_pearson_pred:.2f}")


top k Cosine Prediction:  4.74
top kPearson prediction: 4.10


In [9]:
# after implementing top k predictions:
# the results now show a refined memory based collaborative filtering
# by using only the top-k most similar users(instead of using all of them)
# making prediction less noisy and more realistic than the previous one

# get top k similar users - selects k most sim. users to targer user(using cos or pears)
# with similarity threshold

# predict_rating_top_k - uses solely those top-k users when predicting a rating, avoiding. weak/noisy similarities.


#full cosine similarity - 4.40
#full pearson similarity - 4.48
# including all users even noisy/unrelated ones

#top k cosine (k=30) - 4.74| recommendation is more confident
#top k pearson (k=30) - 4.10| rec is more conservative

#metrics. such as Precision@K and Recall@K
# precision@ k - from K recommended movies, how many were actually liked?
# recall@ k - of all movies that the user liked, how many did we recommend in top K?

#other model based CF approaches for ranking instead of rating prediction

# BPR bayesian personalized ranking - pairwise learning to rank
#for a given user u, if they liked item i,
#they should prefer i over some item j they didn’t interact with.
#Maximize probability that i ≻ j.
#is better for clicks, views (lightFM, Implicit)

#ALS Alternating least squares
# matrix factorization method
# supports explicit & implicit feedback (Spark MLlib. implicit, surprise)

#NN Based neural recommenders
#learn complex non linear interactions between user/items
# NeuMF, NCF, AutoRec, DeepFM
# use of transformers for sequence based rec


In [10]:
#adding top N recommendation generation

%%writefile topn.py

def recommend_top_n(user_id, rating_matrix, similarity_matrix, n=10, k=30, min_similarity=0.0):
  '''recommend top N items based on top K similar users'''
  if user_id not in similarity_matrix.index:
    return []

  # getting top k similar users
  sim_scores = similarity_matrix.loc[user_id].drop(user_id)
  top_k_users = sim_scores[sim_scores >= min_similarity].nlargest(k).index

  #items that the targer user has already rated
  user_rated_items = set(rating_matrix.loc[user_id][rating_matrix.loc[user_id] > 0].index)

  #items rated by top k users but not by the target user
  candidate_items = set()
  for neighbor in top_k_users:
    neighbor_rated = rating_matrix.loc[neighbor][rating_matrix.loc[neighbor] > 0 ].index
    candidate_items.update(neighbor_rated)

  candidate_items.difference_update(user_rated_items)

  #predict ratings for candidate items
  predictions = {}
  for item in candidate_items:
    numer, denom = 0.0, 0.0
    for neighbor in top_k_users:
      sim = similarity_matrix.at[user_id, neighbor]
      neighbor_rating = rating_matrix.at[neighbor, item] if item in rating_matrix.columns else 0

      if neighbor_rating >0:
        numer += sim* neighbor_rating
        denom += abs(sim)

    if denom>0:
      predictions[item] = numer / denom


  # return top N highest predicted ratings
  top_n_items = sorted(predictions.items(), key=lambda x: x[1], reverse= True)[:n]
  return top_n_items

Writing topn.py


In [11]:
# usage reference for topn

from topn import recommend_top_n

#generate top 5 recommended movie_ids for user 1 using cos similarity
top_5cosine = recommend_top_n(
    user_id=1,
    rating_matrix=user_item_filled,
    similarity_matrix = user_similarity_cosine,
    n=5,
    k=30,
    min_similarity=0.1
)

print("top 5 Cosine recommended")
print(top_5cosine)


#.generate top 5 rec movie_ids for user 1 using pears similarity
top_5pearson = recommend_top_n(
    user_id=1,
    rating_matrix=user_item_filled,
    similarity_matrix=user_similarity_pearson,
    n=5,
    k=30,
    min_similarity=0.1
)

print("top 5 Pearson")
print(top_5pearson)

top 5 Cosine recommended
[(690, np.float64(5.000000000000001)), (522, np.float64(5.0)), (641, np.float64(5.0)), (853, np.float64(5.0)), (1111, np.float64(5.0))]
top 5 Pearson
[(524, np.float64(5.0)), (1048, np.float64(5.0)), (603, np.float64(5.0)), (604, np.float64(5.0)), (650, np.float64(5.0))]


In [12]:
# showing movie titles from top_5cosine
top_movie_ids = [movie_id for movie_id, _ in top_5cosine]
recommended_titles = movies[movies['movie_id'].isin(top_movie_ids)][['movie_id', 'title']]

print("Top. 5 Cosine N recommendation for user 1")
print(recommended_titles)


Top. 5 Cosine N recommendation for user 1
      movie_id                        title
521        522           Down by Law (1986)
640        641        Paths of Glory (1957)
689        690  Seven Years in Tibet (1997)
852        853             Braindead (1992)
1110      1111      Double Happiness (1994)


In [13]:
# movie titles form top_5pearson
top_movie_ids = [movie_id for movie_id, _ in top_5pearson]
recommended_titles = movies[movies['movie_id'].isin(top_movie_ids)][['movie_id', 'title']]


print("Top. 5 Pearson N recommendation for user 1")
print(recommended_titles)

Top. 5 Pearson N recommendation for user 1
      movie_id                                            title
523        524                       Great Dictator, The (1940)
602        603                               Rear Window (1954)
603        604                     It Happened One Night (1934)
649        650  Seventh Seal, The (Sjunde inseglet, Det) (1957)
1047      1048                             She's the One (1996)


In [14]:
# top k filtering ensures only top 5 similar users are considered
# top n recommendations are about ranking items (not just predicting ratings)

#cos sim is angle based. | movies popular among similar-rating users
#pear sim is mean centered / correlation based | movies w similar rating patterns, not just high ratings

#eg cos sim| 690 and 641 may have had similar fan bases to the user
#eg pear sim| 603 and 604 are critically acclaimed classics

In [37]:
# creating evaluator.py

# with extra functions for ranking evaluation

%%writefile evaluator.py

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from collections import defaultdict

def train_test_split(ratings_df, test_size=0.2, seed=42):
    """
    splitting ratings into train/test sets (randomly)
    """
    np.random.seed(seed)
    shuffled = ratings_df.sample(frac=1, random_state=seed)
    test_count = int(len(shuffled) * test_size)


    test_df = shuffled.iloc[:test_count]
    train_df = shuffled.iloc[test_count:]

    return train_df, test_df

def evaluate(predict_fn, test_df, *predict_args):
  ''' eval prediction function on test set '''

  y_true = []
  y_pred = []

  for row in test_df.itertuples():
    pred = predict_fn(row.user_id, row.movie_id, *predict_args)
    if not np.isnan(pred):
      y_true.append(row.rating)
      y_pred.append(pred)

  rmse= np.sqrt(mean_squared_error(y_true, y_pred))
  mae = mean_absolute_error(y_true, y_pred)
  return rmse, mae


def get_top_n_recommendations(test_df, rating_matrix, similarity_matrix, recommend_fn, n=5, k=20, min_similarity=0.0):
  ''' for each user in test_df, get top n recs.'''
  user_recs = defaultdict(list)
  users = test_df['user_id'].unique()

  for user_id in users:
    recs = recommend_fn(
        user_id = user_id,
        rating_matrix = rating_matrix,
        similarity_matrix = similarity_matrix,
        n=n,
        k=k,
        min_similarity = min_similarity
    )
    if recs:
      recommended_items = [item for item, _ in recs]
      user_recs[user_id] = recommended_items
  return user_recs

def precision_recall_at_k(user_recs, test_df, k=5):
  '''computes precision@k & recall@k w/ ground truth from test_df'''
  relevant = defaultdict(set)
  for row in test_df.itertuples():
    if row.rating >= 4:
      relevant[row.user_id].add(row.movie_id)

  precisions, recalls = [], []

  for user_id, recommended_items in user_recs.items():
    true_items = relevant.get(user_id, set())
    if not true_items:
      continue

    recommended_top_k = set(recommended_items[:k])
    n_rel_and_rec = len(recommended_top_k & true_items)

    precision = n_rel_and_rec / k
    recall = n_rel_and_rec / len(true_items)

    precisions.append(precision)
    recalls.append(recall)

  avg_precision = np.mean(precisions)
  avg_recall =np.mean(recalls)
  return avg_precision, avg_recall

def evaluate_predictions_item_based(predict_fn, test_df, rating_matrix=None, similarity_matrix_or_means=None, centered_matrix=None, **kwargs):
  ''' generalized eval for item based CF (cos or pear)
      -'similarity_matrix_or_means' can vary:
      * for cosine: similarity_matrix
      * for pearson: (centered_matrix, item_means)'''
  y_true = []
  y_pred = []

  is_pearson = isinstance(similarity_matrix_or_means, tuple)
  if is_pearson:
      item_similarity_matrix, item_means = similarity_matrix_or_means

  for row in test_df.itertuples():
      try:
          if is_pearson:
              pred = predict_fn(
                  row.user_id, row.movie_id,
                  centered_matrix,
                  item_similarity_matrix,
                  item_means,
                  **kwargs
              )
          else:
              pred = predict_fn(
                  row.user_id, row.movie_id,
                  rating_matrix,
                  similarity_matrix_or_means,
                  **kwargs
                )

          if not np.isnan(pred):
              y_true.append(row.rating)
              y_pred.append(pred)
      except Exception as e:
          print(f"Prediction error for user {row.user_id}, movie {row.movie_id}: {e}")

  rmse = np.sqrt(mean_squared_error(y_true, y_pred))
  mae = mean_absolute_error(y_true, y_pred)

  return rmse, mae , len(y_pred)

Overwriting evaluator.py


In [16]:
# usage reference
from data_loader import build_user_item_matrix, fill_missing_zero, center_ratings
from similarity import compute_cosine_similarity, compute_pearson_similarity
from predictor import predict_rating_cosine, predict_rating_pearson
from evaluator import train_test_split, evaluate

In [17]:
# split the original ratings
train_df, test_df = train_test_split(ratings)

# build matrices from training data only
train_user_item = build_user_item_matrix(train_df)

train_user_item_filled = fill_missing_zero(train_user_item)

train_user_item_centered, train_user_means = center_ratings(train_user_item)

# get similarities from training data
user_sim_cosine = compute_cosine_similarity(train_user_item_filled)

user_sim_pearson = compute_pearson_similarity(train_user_item_centered)


In [18]:
# evaluate cosine

rmse_cos, mae_cos = evaluate(
    predict_rating_top_k_cosine,
    test_df,
    train_user_item_filled,
    user_sim_cosine
)

print(f"Cosine Top K RMSE: {rmse_cos:4f}, Cosine Top K MAE: {mae_cos:4f}")


Cosine Top K RMSE: 1.107655, Cosine Top K MAE: 0.860426


In [19]:
# evaluate pearson

rmse_pear, mae_pear = evaluate(
    predict_rating_top_k_pearson,
    test_df,
    train_user_item_centered,
    user_sim_pearson,
    train_user_means
)

print(f"Pearson Top K RMSE: {rmse_pear:4f}, Pearson Top K MAE:{mae_pear:4f}")

Pearson Top K RMSE: 1.277526, Pearson Top K MAE:1.001483


In [20]:
# evaluate but now with top-N ranking evaluation
from evaluator import get_top_n_recommendations, precision_recall_at_k

top_n_recs = get_top_n_recommendations(
    test_df = test_df,
    rating_matrix = train_user_item_filled,
    similarity_matrix = user_sim_cosine,
    recommend_fn = recommend_top_n, #from topn.py
    n=5,
    k=30,
    min_similarity = 0.2
)

prcsn_5, rcll_5 = precision_recall_at_k(top_n_recs,  test_df, k=5)
print(f"Precision@5 : {prcsn_5:.4f},  Recall@5 : {rcll_5:.4f}")


Precision@5 : 0.0054,  Recall@5 : 0.0040


In [21]:
# Review of results

# Precision@5 : 0.0054,
# meaning ~3 out of 5 recommended movies were already rated highly by the user, based on existing test data

# Recall@5 : 0.0040
# meaning only %40 of liked movies were retrieved in the top 5 predictions, the rest were not.


# SUMMARY
# precision@k - % of recommneded movies that are actually relevant, measures how many in top k are known to be relevatn based on existing ratings
# recall@k - % of relevant movies that were actually recommended, measures how many of the relevant(liked) movies were found in the top-k
# "relevant" here - user gave a high rating, usually 4>=
# .@k - number of novies recommended, .@5 = we are evaluating the top 5 predicted movies


In [22]:
############## OVERALL SUMMARY OF PROJECT ###############

# Collaborative Filtering:
# User behavior based, not based in attributes of items/products
# eg. If users A & B liked the same things in the past, they might like similar things in future

#=========================================
# MEMORY BASED CF / DONE HERE

# user-based CF -> Cosine Similarity -> Predict a rating based on similar users' ratings (angle betweem vectors)
# user-based CF -> Pearson Similarity -> Adjust for user biases by centering ratings (linear relationship between variables)
# Top K Filtering -> Cosine / Pearson (Top K) -> Only use the top K similar users to avoid noise
# RMSE / MAE - For accuracy of predicted ratings
# Precision@K and Recall@K - Accuracy of recommendation lists (used in top N recommendation evaluation)


# Not Done here yet:
# Item-Based CF - Use similar movies instead of similar users
# Significance Weighting - Weight down similarities with fewer co-rated items.
#=================================================



# Up ahead...
#==========================================
# MODEL BASED CF:
# Learn latent factors via optimization (matrix factorization), big systems like netflix use

# Most common types of ModeL Based CF
# SVD / Matrix Factorization -> Learns user/item vectors that explain ratings
# Alternating Least Squares (ALS) -> Optimization-based matrix factorization (used at scale)
# Deep Learning (Autoencoders) -> Learns Patterns in sparse rating data
# Neural Collaborative Filtering -> Uses embeddings & neural nets for scoring

#==========================================


# Further ahead... REAL LIFE USAGES
#================================================
# Big companies use:
# NDCG@K (Normalized Discounted Cumulative Gain) -> Rewards correct ranking in top-N
# MAP@K (Mean Average Precision) -> Measures how early relevant items appear
# Hit Rate@K -> Measure whther any relevant item appears in top-K
# Coverage -> % of items recommended at least once
# Diversity / Novelty / Serendipity - > Encourage unexpected but relevant recs
# Business KPIs -> Watch time, click-through, purchase rate, etc.

# These ALL often optimize MULTIPLE OBJECTIVES at once, not just prediction error.
#==================================================


In [23]:
# Item Based Collaborative Filtering (ITEM-ITEM CF)

%%writefile item_similarity.py

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def compute_item_similarity_cosine(user_item_matrix):
  ''' cosine similarity between items (columns)'''
  item_vectors = user_item_matrix.T # transposes columns as rows
  similarity = cosine_similarity(item_vectors)
  return pd.DataFrame(similarity, index=item_vectors.index, columns=item_vectors.index)

def compute_item_similarity_pearson(user_item_centered):
  '''pearson corelation between items'''
  return user_item_centered.corr(method='pearson')


Writing item_similarity.py


In [24]:
# usage reference for item_similarity.py
#now comparing columns (items) instead of rows (users)

from item_similarity import compute_item_similarity_cosine, compute_item_similarity_pearson

#get similarity matrices
item_similarity_cosine = compute_item_similarity_cosine(user_item_filled)
item_similarity_pearson = compute_item_similarity_pearson(user_item_centered)

# example
item_similarity_cosine.loc[50].sort_values(ascending=False).head(10)
item_similarity_pearson.loc[50].sort_values(ascending=False).head(10)


Unnamed: 0_level_0,50
movie_id,Unnamed: 1_level_1
788,1.0
1242,1.0
784,1.0
50,1.0
1542,1.0
1497,1.0
1189,1.0
766,1.0
1523,1.0
1549,1.0


In [25]:
# prediction using item-item based CF

from predictor import predict_rating_item_cosine, predict_rating_item_pearson

cosine_pred_item = predict_rating_item_cosine(
    user_id=1,
    movie_id=50,
    rating_matrix=user_item_filled,
    item_similarity_matrix=item_similarity_cosine
)

pearson_pred_item = predict_rating_item_pearson(
    user_id=1,
    movie_id=50,
    centered_matrix=user_item_centered.T, #transpose to get colums as rows
    item_similarity_matrix=item_similarity_pearson,
    item_means=item_means
)

print(f"Item-based Prediction (Cosine): {cosine_pred_item:.2f}")
print(f"Item-based Prediction (Pearson): {pearson_pred_item:.2f}")

Item-based Prediction (Cosine): 3.79
Item-based Prediction (Pearson): 4.36


In [26]:
# Result analysis for item based predictions:

#Item-based Prediction (Cosine): 3.79
# Based on items similar to movie_id 50 that user_id 1 has rated...
# the model predicts the user would rate it ~3.79
# Cosine here rates patterns without centering, influenced by magnitude
# magnitude (eg user who rate high in general)

#Item-based Prediction (Pearson): 4.36
# Based on other movies ahtat user 1 has rated & based on how similar these ( movies user has rated)
# movies are to movie 50, model says user would rate 4.36
# strong positive correlations between movies user 1 liked & target movie
# this centers each item's ratings, removing global popularity bias

# it just expects the user to give an aproximate rating, it does not have to be as accurate as model says

In [27]:
from data_loader import build_user_item_matrix, fill_missing_zero, center_ratings

train_ratings, test_ratings = train_test_split(ratings)

# user-item matrix from train set
user_item_train = build_user_item_matrix(train_ratings)
user_item_filled_train = fill_missing_zero(user_item_train)
user_item_centered_train, user_means_train = center_ratings(user_item_train)
item_means_train = user_item_train.mean(axis=0)


In [28]:
# get movies that appear in the training set
train_movie_ids = train_ratings['movie_id'].unique()
train_user_ids = train_ratings['user_id'].unique()

# subset the centered matrix to only include movies & users from training
user_item_centered_train = user_item_centered.loc[train_user_ids, train_movie_ids]

# compute item-item similarity only among training movies
item_similarity_pearson = user_item_centered_train.T.corr(method='pearson')


In [29]:
# Evaluating Item-item based CF (Cosine & Pearson)
# rmse & mae
from evaluator import evaluate_predictions_item_based
from predictor import predict_rating_item_cosine, predict_rating_item_pearson

item_cos_rmse, item_cos_mae,  item_cos_n_pred  = evaluate_predictions_item_based(
    predict_fn=predict_rating_item_cosine,
    test_df=test_ratings,
    rating_matrix=user_item_filled,
    similarity_matrix_or_means=item_similarity_cosine
)

item_pearson_rmse, item_pearson_mae,  item_pearson_n_pred  = evaluate_predictions_item_based(
    predict_fn=predict_rating_item_pearson,
    test_df=test_ratings,
    centered_matrix=user_item_centered,
    similarity_matrix_or_means=(item_similarity_pearson, item_means)
)


# coverage
cos_coverage = item_cos_n_pred / len(test_ratings)
pearson_coverage = item_pearson_n_pred / len(test_ratings)

print(f"Item-Cosine RMSE: {item_cos_rmse:.4f}, MAE: {item_cos_mae:.4f}, Cosine Coverage: {cos_coverage:.2%}")
print(f"Item-Pearson RMSE: {item_pearson_rmse:.4f}, MAE: {item_pearson_mae:.4f}, Pearson Coverage: {pearson_coverage:.2%}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Prediction error for user 834, movie 751: '[1017] not in index'
Prediction error for user 551, movie 98: '[944, 950, 955, 959, 975, 979, 991, 1011, 1028, 1035, 1039, 1044, 1047, 1051, 1059, 1067, 1079, 1087, 1118, 1135, 1136, 1139, 1169, 1207, 1217, 1220, 1253, 1267, 1303, 1304, 1314, 1376, 1419, 1439, 1443, 1518, 1621] not in index'
Prediction error for user 934, movie 56: '[949, 961, 963, 965, 972, 1018, 1037, 1065, 1135, 1203, 1285, 1311, 1411, 1425, 1449] not in index'
Prediction error for user 868, movie 61: '[946, 998, 1028, 1031, 1035, 1037, 1076, 1098, 1183, 1188, 1206, 1240, 1285, 1480, 1509] not in index'
Prediction error for user 600, movie 29: '[947, 1004, 1110, 1188, 1228, 1231, 1239, 1274, 1407, 1419] not in index'
Prediction error for user 405, movie 69: '[944, 946, 947, 949, 951, 953, 954, 955, 956, 957, 958, 959, 960, 964, 969, 970, 971, 972, 994, 996, 997, 999, 1004, 1005, 1006, 1018, 1019, 1021, 1027, 1

In [30]:
# todo analyze results from item based CF RMSE & MAE

# The error messages in the ouput show that the movie id in the test set does not exist
# in the item similarity matrix, rows were skipped to maintain fairness

#Prediction error for user 650, movie 612: '[968, ..., 1474, 1627] not in index'
# When trying to predict how user 650 would rate movie 612
# the model tried to look up similar movies to 612 (item based filtering)
# the movies are all lovated inside the list, but these movies are NOT in the item_sim_matrix
# or in the relevant matrix used to pull similarity data, but why?

# Movies that user 650 rated are not in the training set, or
# or they were filtered out during matrix construction due to missing data.

# Possibilities:
# Yes, user 650 did rate those movies in the original dataset (the full ratings_df)
# But those movies were:
# Not present in the item_similarity_matrix, because:
# They weren’t in the training set (they were filtered out)
# Or they had too few ratings, causing NaN correlations during .corr(), so they got dropped


#COSINE RMSE & MAE
#Item-Cosine RMSE: 0.9747, MAE: 0.7769
#-> Predicted RMSE ratings are ~0.97 stars off
# meaning it rates 3 to a 4 star movie, basically one star off.

#-> Predicted MAE ratings are 0~0.77 stars off
# meaning it rates a movie 2.3 stars instead of 3 stars

#-> Coverage 100% Meaning it makes predictions for every rating in the test set



# PEARSON RMSE & MAE
#Item-Pearson RMSE: 0.9776, MAE: 0.7854
# The same reasoning applied for this (very similar metrics compared to Cosine)

#-> Coverage 6.59% - Uses centered ratings which introduces more NaNs which difficult similarity computations
# (no rating takes you to no mean substracted value and cant compute similarity)
# makes predictions for only 6.59% of test ratings, VERY LOW COVERAGE

In [38]:
# item-item CF predictor with top k

from predictor import predict_rating_top_k_item_cosine, predict_rating_top_k_item_pearson

evaluate_predictions_item_based(
    predict_fn = predict_rating_top_k_item_cosine,
    test_df = test_ratings,
    rating_matrix = user_item_filled,
    similarity_matrix_or_means = item_similarity_cosine,
    k=20,
    min_similarity= 0.0
)

(np.float64(0.9578721650236994), 0.7385015501172574, 19793)

In [39]:
# item-item CF Top K Cosine Evaluation
from predictor import predict_rating_top_k_item_cosine
from evaluator import evaluate_predictions_item_based

item_topk_cos_rmse, item_topk_cos_mae, item_topk_cos_n_pred =evaluate_predictions_item_based(
    predict_fn=lambda u, m,mat, sim: predict_rating_top_k_item_cosine(
        u, m, mat, sim, k=20, min_similarity=0.0),
    test_df = test_ratings,
    rating_matrix =user_item_filled,
    similarity_matrix_or_means=item_similarity_cosine
)

item_topk_cos_coverage = item_topk_cos_n_pred / len(test_ratings)
print(f"Top-K item-Cosine RMSE: {item_topk_cos_rmse:.4f}, MAE: {item_topk_cos_mae:.4f}, Coverage: {item_topk_cos_coverage:.4%}")


Top-K item-Cosine RMSE: 0.9579, MAE: 0.7385, Coverage: 98.9650%


In [48]:
# item-item CF Top K Pearson Evaluation
from predictor import predict_rating_top_k_item_pearson
from evaluator import evaluate_predictions_item_based

item_topk_pearson_rmse, item_topk_pearson_mae,item_topk_pearson_n_pred = evaluate_predictions_item_based(
    predict_fn=lambda u, m, _, cm, sim_and_mean, **kwargs: predict_rating_top_k_item_pearson(
      u, m, cm, sim_and_mean[0], sim_and_mean[1], **kwargs),
    test_df=test_ratings,
    centered_matrix= user_item_centered,
    similarity_matrix_or_means = (item_similarity_pearson, item_means),
    k=20,
    min_similarity=0.0
)

item_topk_pearson_coverage = item_topk_pearson_n_pred/ len(test_ratings)
print(f"Top-K Item-Pearson RMSE: {item_topk_pearson_rmse:.4f}, MAE: {item_topk_pearson_mae:.4f}, Coverage: {item_topk_pearson_coverage:.2%}")

Prediction error for user 877, movie 381: 0
Prediction error for user 815, movie 602: 0
Prediction error for user 94, movie 431: 0
Prediction error for user 416, movie 875: 0
Prediction error for user 500, movie 182: 0
Prediction error for user 259, movie 1074: 0
Prediction error for user 598, movie 286: 0
Prediction error for user 886, movie 496: 0
Prediction error for user 837, movie 15: 0
Prediction error for user 521, movie 184: 0
Prediction error for user 459, movie 864: 0
Prediction error for user 622, movie 568: 0
Prediction error for user 655, movie 1197: 0
Prediction error for user 128, movie 99: 0
Prediction error for user 308, movie 31: 0
Prediction error for user 930, movie 286: 0
Prediction error for user 43, movie 14: 0
Prediction error for user 42, movie 176: 0
Prediction error for user 450, movie 618: 0
Prediction error for user 521, movie 568: 0
Prediction error for user 698, movie 176: 0
Prediction error for user 13, movie 476: 0
Prediction error for user 796, movie 8

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.